// All Downloads are FREE. Search and download functionalities are using the official Maven repository.

// org.apache.orc.impl.RunLengthIntegerWriterV2 Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.orc.impl;

import java.io.IOException;

/**
 * 

A writer that performs light weight compression over sequence of integers. *

*

There are four types of lightweight integer compression

*
    *
  • SHORT_REPEAT
  • *
  • DIRECT
  • *
  • PATCHED_BASE
  • *
  • DELTA
  • *
*

The description and format for these types are as below: * SHORT_REPEAT: Used for short repeated integer sequences.

*
    *
  • 1 byte header *
      *
    • 2 bits for encoding type
    • *
    • 3 bits for bytes required for repeating value
    • *
    • 3 bits for repeat count (MIN_REPEAT + run length)
    • *
    *
  • *
  • Blob - repeat value (fixed bytes)
  • *
*

* DIRECT: Used for random integer sequences whose number of bit * requirement doesn't vary a lot.

*
    *
  • 2 byte header (1st byte) *
      *
    • 2 bits for encoding type
    • *
    • 5 bits for fixed bit width of values in blob
    • *
    • 1 bit for storing MSB of run length
    • *
  • *
  • 2nd byte *
      *
    • 8 bits for lower run length bits
    • *
    *
  • *
  • Blob - stores the direct values using fixed bit width. The length of the * data blob is (fixed width * run length) bits long
  • *
*

* PATCHED_BASE: Used for random integer sequences whose number of bit * requirement varies beyond a threshold.

*
    *
  • 4 bytes header (1st byte) *
      *
    • 2 bits for encoding type
    • *
    • 5 bits for fixed bit width of values in blob
    • *
    • 1 bit for storing MSB of run length
    • *
  • *
  • 2nd byte *
      *
    • 8 bits for lower run length bits
    • *
  • *
  • 3rd byte *
      *
    • 3 bits for bytes required to encode base value
    • *
    • 5 bits for patch width
    • *
  • *
  • 4th byte *
      *
    • 3 bits for patch gap width
    • *
    • 5 bits for patch length
    • *
    *
  • *
  • Base value - Stored using fixed number of bytes. If MSB is set, base * value is negative else positive. Length of base value is (base width * 8) * bits.
  • *
  • Data blob - Base reduced values as stored using fixed bit width. Length * of data blob is (fixed width * run length) bits.
  • *
  • Patch blob - Patch blob is a list of gap and patch value. Each entry in * the patch list is (patch width + patch gap width) bits long. Gap between the * subsequent elements to be patched are stored in upper part of entry whereas * patch values are stored in lower part of entry. Length of patch blob is * ((patch width + patch gap width) * patch length) bits.
  • *
*

* DELTA Used for monotonically increasing or decreasing sequences, * sequences with fixed delta values or long repeated sequences. *

    *
  • 2 bytes header (1st byte) *
      *
    • 2 bits for encoding type
    • *
    • 5 bits for fixed bit width of values in blob
    • *
    • 1 bit for storing MSB of run length
    • *
  • *
  • 2nd byte *
      *
    • 8 bits for lower run length bits
    • *
  • *
  • Base value - zigzag encoded value written as varint
  • *
  • Delta base - zigzag encoded value written as varint
  • *
  • Delta blob - only positive values. monotonicity and orderness are decided * based on the sign of the base value and delta base
  • *
*/ public class RunLengthIntegerWriterV2 implements IntegerWriter { public enum EncodingType { SHORT_REPEAT, DIRECT, PATCHED_BASE, DELTA } static final int MAX_SCOPE = 512; static final int MIN_REPEAT = 3; private static final int MAX_SHORT_REPEAT_LENGTH = 10; private long prevDelta = 0; private int fixedRunLength = 0; private int variableRunLength = 0; private final long[] literals = new long[MAX_SCOPE]; private final PositionedOutputStream output; private final boolean signed; private EncodingType encoding; private int numLiterals; private final long[] zigzagLiterals = new long[MAX_SCOPE]; private final long[] baseRedLiterals = new long[MAX_SCOPE]; private final long[] adjDeltas = new long[MAX_SCOPE]; private long fixedDelta; private int zzBits90p; private int zzBits100p; private int brBits95p; private int brBits100p; private int bitsDeltaMax; private int patchWidth; private int patchGapWidth; private int patchLength; private long[] gapVsPatchList; private long min; private boolean isFixedDelta; private SerializationUtils utils; private boolean alignedBitpacking; RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed) { this(output, signed, true); } public RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed, boolean alignedBitpacking) { this.output = output; this.signed = signed; this.alignedBitpacking = alignedBitpacking; this.utils = new SerializationUtils(); clear(); } private void writeValues() throws IOException { if (numLiterals != 0) { if (encoding.equals(EncodingType.SHORT_REPEAT)) { writeShortRepeatValues(); } else if (encoding.equals(EncodingType.DIRECT)) { writeDirectValues(); } else if (encoding.equals(EncodingType.PATCHED_BASE)) { writePatchedBaseValues(); } else { writeDeltaValues(); } // clear all the variables clear(); } } private void writeDeltaValues() throws IOException { int len = 0; int fb = bitsDeltaMax; int efb = 0; if (alignedBitpacking) { fb = utils.getClosestAlignedFixedBits(fb); } if (isFixedDelta) { 
// if fixed run length is greater than threshold then it will be fixed // delta sequence with delta value 0 else fixed delta sequence with // non-zero delta value if (fixedRunLength > MIN_REPEAT) { // ex. sequence: 2 2 2 2 2 2 2 2 len = fixedRunLength - 1; fixedRunLength = 0; } else { // ex. sequence: 4 6 8 10 12 14 16 len = variableRunLength - 1; variableRunLength = 0; } } else { // fixed width 0 is used for long repeating values. // sequences that require only 1 bit to encode will have an additional bit if (fb == 1) { fb = 2; } efb = utils.encodeBitWidth(fb); efb = efb << 1; len = variableRunLength - 1; variableRunLength = 0; } // extract the 9th bit of run length final int tailBits = (len & 0x100) >>> 8; // create first byte of the header final int headerFirstByte = getOpcode() | efb | tailBits; // second byte of the header stores the remaining 8 bits of runlength final int headerSecondByte = len & 0xff; // write header output.write(headerFirstByte); output.write(headerSecondByte); // store the first value from zigzag literal array if (signed) { utils.writeVslong(output, literals[0]); } else { utils.writeVulong(output, literals[0]); } if (isFixedDelta) { // if delta is fixed then we don't need to store delta blob utils.writeVslong(output, fixedDelta); } else { // store the first value as delta value using zigzag encoding utils.writeVslong(output, adjDeltas[0]); // adjacent delta values are bit packed. The length of adjDeltas array is // always one less than the number of literals (delta difference for n // elements is n-1). We have already written one element, write the // remaining numLiterals - 2 elements here utils.writeInts(adjDeltas, 1, numLiterals - 2, fb, output); } } private void writePatchedBaseValues() throws IOException { // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding // because patch is applied to MSB bits. 
For example: If fixed bit width of // base value is 7 bits and if patch is 3 bits, the actual value is // constructed by shifting the patch to left by 7 positions. // actual_value = patch << 7 | base_value // So, if we align base_value then actual_value can not be reconstructed. // write the number of fixed bits required in next 5 bits final int fb = brBits95p; final int efb = utils.encodeBitWidth(fb) << 1; // adjust variable run length, they are one off variableRunLength -= 1; // extract the 9th bit of run length final int tailBits = (variableRunLength & 0x100) >>> 8; // create first byte of the header final int headerFirstByte = getOpcode() | efb | tailBits; // second byte of the header stores the remaining 8 bits of runlength final int headerSecondByte = variableRunLength & 0xff; // if the min value is negative toggle the sign final boolean isNegative = min < 0 ? true : false; if (isNegative) { min = -min; } // find the number of bytes required for base and shift it by 5 bits // to accommodate patch width. The additional bit is used to store the sign // of the base value. final int baseWidth = utils.findClosestNumBits(min) + 1; final int baseBytes = baseWidth % 8 == 0 ? 
baseWidth / 8 : (baseWidth / 8) + 1; final int bb = (baseBytes - 1) << 5; // if the base value is negative then set MSB to 1 if (isNegative) { min |= (1L << ((baseBytes * 8) - 1)); } // third byte contains 3 bits for number of bytes occupied by base // and 5 bits for patchWidth final int headerThirdByte = bb | utils.encodeBitWidth(patchWidth); // fourth byte contains 3 bits for page gap width and 5 bits for // patch length final int headerFourthByte = (patchGapWidth - 1) << 5 | patchLength; // write header output.write(headerFirstByte); output.write(headerSecondByte); output.write(headerThirdByte); output.write(headerFourthByte); // write the base value using fixed bytes in big endian order for(int i = baseBytes - 1; i >= 0; i--) { byte b = (byte) ((min >>> (i * 8)) & 0xff); output.write(b); } // base reduced literals are bit packed int closestFixedBits = utils.getClosestFixedBits(fb); utils.writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits, output); // write patch list closestFixedBits = utils.getClosestFixedBits(patchGapWidth + patchWidth); utils.writeInts(gapVsPatchList, 0, gapVsPatchList.length, closestFixedBits, output); // reset run length variableRunLength = 0; } /** * Store the opcode in 2 MSB bits * @return opcode */ private int getOpcode() { return encoding.ordinal() << 6; } private void writeDirectValues() throws IOException { // write the number of fixed bits required in next 5 bits int fb = zzBits100p; if (alignedBitpacking) { fb = utils.getClosestAlignedFixedBits(fb); } final int efb = utils.encodeBitWidth(fb) << 1; // adjust variable run length variableRunLength -= 1; // extract the 9th bit of run length final int tailBits = (variableRunLength & 0x100) >>> 8; // create first byte of the header final int headerFirstByte = getOpcode() | efb | tailBits; // second byte of the header stores the remaining 8 bits of runlength final int headerSecondByte = variableRunLength & 0xff; // write header output.write(headerFirstByte); 
output.write(headerSecondByte); // bit packing the zigzag encoded literals utils.writeInts(zigzagLiterals, 0, numLiterals, fb, output); // reset run length variableRunLength = 0; } private void writeShortRepeatValues() throws IOException { // get the value that is repeating, compute the bits and bytes required long repeatVal = 0; if (signed) { repeatVal = utils.zigzagEncode(literals[0]); } else { repeatVal = literals[0]; } final int numBitsRepeatVal = utils.findClosestNumBits(repeatVal); final int numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? numBitsRepeatVal >>> 3 : (numBitsRepeatVal >>> 3) + 1; // write encoding type in top 2 bits int header = getOpcode(); // write the number of bytes required for the value header |= ((numBytesRepeatVal - 1) << 3); // write the run length fixedRunLength -= MIN_REPEAT; header |= fixedRunLength; // write the header output.write(header); // write the repeating value in big endian byte order for(int i = numBytesRepeatVal - 1; i >= 0; i--) { int b = (int) ((repeatVal >>> (i * 8)) & 0xff); output.write(b); } fixedRunLength = 0; } private void determineEncoding() { // we need to compute zigzag values for DIRECT encoding if we decide to // break early for delta overflows or for shorter runs computeZigZagLiterals(); zzBits100p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 1.0); // not a big win for shorter runs to determine encoding if (numLiterals <= MIN_REPEAT) { encoding = EncodingType.DIRECT; return; } // DELTA encoding check // for identifying monotonic sequences boolean isIncreasing = true; boolean isDecreasing = true; this.isFixedDelta = true; this.min = literals[0]; long max = literals[0]; final long initialDelta = literals[1] - literals[0]; long currDelta = 0; long deltaMax = 0; this.adjDeltas[0] = initialDelta; for (int i = 1; i < numLiterals; i++) { final long l1 = literals[i]; final long l0 = literals[i - 1]; currDelta = l1 - l0; min = Math.min(min, l1); max = Math.max(max, l1); isIncreasing &= (l0 <= l1); 
isDecreasing &= (l0 >= l1); isFixedDelta &= (currDelta == initialDelta); if (i > 1) { adjDeltas[i - 1] = Math.abs(currDelta); deltaMax = Math.max(deltaMax, adjDeltas[i - 1]); } } // its faster to exit under delta overflow condition without checking for // PATCHED_BASE condition as encoding using DIRECT is faster and has less // overhead than PATCHED_BASE if (!utils.isSafeSubtract(max, min)) { encoding = EncodingType.DIRECT; return; } // invariant - subtracting any number from any other in the literals after // this point won't overflow // if min is equal to max then the delta is 0, this condition happens for // fixed values run >10 which cannot be encoded with SHORT_REPEAT if (min == max) { assert isFixedDelta : min + "==" + max + ", isFixedDelta cannot be false"; assert currDelta == 0 : min + "==" + max + ", currDelta should be zero"; fixedDelta = 0; encoding = EncodingType.DELTA; return; } if (isFixedDelta) { assert currDelta == initialDelta : "currDelta should be equal to initialDelta for fixed delta encoding"; encoding = EncodingType.DELTA; fixedDelta = currDelta; return; } // if initialDelta is 0 then we cannot delta encode as we cannot identify // the sign of deltas (increasing or decreasing) if (initialDelta != 0) { // stores the number of bits required for packing delta blob in // delta encoding bitsDeltaMax = utils.findClosestNumBits(deltaMax); // monotonic condition if (isIncreasing || isDecreasing) { encoding = EncodingType.DELTA; return; } } // PATCHED_BASE encoding check // percentile values are computed for the zigzag encoded values. if the // number of bit requirement between 90th and 100th percentile varies // beyond a threshold then we need to patch the values. 
if the variation // is not significant then we can use direct encoding zzBits90p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 0.9); int diffBitsLH = zzBits100p - zzBits90p; // if the difference between 90th percentile and 100th percentile fixed // bits is > 1 then we need patch the values if (diffBitsLH > 1) { // patching is done only on base reduced values. // remove base from literals for (int i = 0; i < numLiterals; i++) { baseRedLiterals[i] = literals[i] - min; } // 95th percentile width is used to determine max allowed value // after which patching will be done brBits95p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 0.95); // 100th percentile is used to compute the max patch width brBits100p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 1.0); // after base reducing the values, if the difference in bits between // 95th percentile and 100th percentile value is zero then there // is no point in patching the values, in which case we will // fallback to DIRECT encoding. // The decision to use patched base was based on zigzag values, but the // actual patching is done on base reduced literals. if ((brBits100p - brBits95p) != 0) { encoding = EncodingType.PATCHED_BASE; preparePatchedBlob(); return; } else { encoding = EncodingType.DIRECT; return; } } else { // if difference in bits between 95th percentile and 100th percentile is // 0, then patch length will become 0. 
Hence we will fallback to direct encoding = EncodingType.DIRECT; return; } } private void computeZigZagLiterals() { // populate zigzag encoded literals long zzEncVal = 0; for (int i = 0; i < numLiterals; i++) { if (signed) { zzEncVal = utils.zigzagEncode(literals[i]); } else { zzEncVal = literals[i]; } zigzagLiterals[i] = zzEncVal; } } private void preparePatchedBlob() { // mask will be max value beyond which patch will be generated long mask = (1L << brBits95p) - 1; // since we are considering only 95 percentile, the size of gap and // patch array can contain only be 5% values patchLength = (int) Math.ceil((numLiterals * 0.05)); int[] gapList = new int[patchLength]; long[] patchList = new long[patchLength]; // #bit for patch patchWidth = brBits100p - brBits95p; patchWidth = utils.getClosestFixedBits(patchWidth); // if patch bit requirement is 64 then it will not possible to pack // gap and patch together in a long. To make sure gap and patch can be // packed together adjust the patch width if (patchWidth == 64) { patchWidth = 56; brBits95p = 8; mask = (1L << brBits95p) - 1; } int gapIdx = 0; int patchIdx = 0; int prev = 0; int gap = 0; int maxGap = 0; for(int i = 0; i < numLiterals; i++) { // if value is above mask then create the patch and record the gap if (baseRedLiterals[i] > mask) { gap = i - prev; if (gap > maxGap) { maxGap = gap; } // gaps are relative, so store the previous patched value index prev = i; gapList[gapIdx++] = gap; // extract the most significant bits that are over mask bits long patch = baseRedLiterals[i] >>> brBits95p; patchList[patchIdx++] = patch; // strip off the MSB to enable safe bit packing baseRedLiterals[i] &= mask; } } // adjust the patch length to number of entries in gap list patchLength = gapIdx; // if the element to be patched is the first and only element then // max gap will be 0, but to store the gap as 0 we need atleast 1 bit if (maxGap == 0 && patchLength != 0) { patchGapWidth = 1; } else { patchGapWidth = 
utils.findClosestNumBits(maxGap); } // special case: if the patch gap width is greater than 256, then // we need 9 bits to encode the gap width. But we only have 3 bits in // header to record the gap width. To deal with this case, we will save // two entries in patch list in the following way // 256 gap width => 0 for patch value // actual gap - 256 => actual patch value // We will do the same for gap width = 511. If the element to be patched is // the last element in the scope then gap width will be 511. In this case we // will have 3 entries in the patch list in the following way // 255 gap width => 0 for patch value // 255 gap width => 0 for patch value // 1 gap width => actual patch value if (patchGapWidth > 8) { patchGapWidth = 8; // for gap = 511, we need two additional entries in patch list if (maxGap == 511) { patchLength += 2; } else { patchLength += 1; } } // create gap vs patch list gapIdx = 0; patchIdx = 0; gapVsPatchList = new long[patchLength]; for(int i = 0; i < patchLength; i++) { long g = gapList[gapIdx++]; long p = patchList[patchIdx++]; while (g > 255) { gapVsPatchList[i++] = (255L << patchWidth); g -= 255; } // store patch value in LSBs and gap in MSBs gapVsPatchList[i] = (g << patchWidth) | p; } } /** * clears all the variables */ private void clear() { numLiterals = 0; encoding = null; prevDelta = 0; fixedDelta = 0; zzBits90p = 0; zzBits100p = 0; brBits95p = 0; brBits100p = 0; bitsDeltaMax = 0; patchGapWidth = 0; patchLength = 0; patchWidth = 0; gapVsPatchList = null; min = 0; isFixedDelta = true; } @Override public void flush() throws IOException { if (numLiterals != 0) { if (variableRunLength != 0) { determineEncoding(); writeValues(); } else if (fixedRunLength != 0) { if (fixedRunLength < MIN_REPEAT) { variableRunLength = fixedRunLength; fixedRunLength = 0; determineEncoding(); writeValues(); } else if (fixedRunLength >= MIN_REPEAT && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { encoding = EncodingType.SHORT_REPEAT; writeValues(); } else { 
encoding = EncodingType.DELTA; isFixedDelta = true; writeValues(); } } } output.flush(); } @Override public void write(long val) throws IOException { if (numLiterals == 0) { initializeLiterals(val); } else { if (numLiterals == 1) { prevDelta = val - literals[0]; literals[numLiterals++] = val; // if both values are same count as fixed run else variable run if (val == literals[0]) { fixedRunLength = 2; variableRunLength = 0; } else { fixedRunLength = 0; variableRunLength = 2; } } else { long currentDelta = val - literals[numLiterals - 1]; if (prevDelta == 0 && currentDelta == 0) { // fixed delta run literals[numLiterals++] = val; // if variable run is non-zero then we are seeing repeating // values at the end of variable run in which case keep // updating variable and fixed runs if (variableRunLength > 0) { fixedRunLength = 2; } fixedRunLength += 1; // if fixed run met the minimum condition and if variable // run is non-zero then flush the variable run and shift the // tail fixed runs to start of the buffer if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) { numLiterals -= MIN_REPEAT; variableRunLength -= MIN_REPEAT - 1; // copy the tail fixed runs long[] tailVals = new long[MIN_REPEAT]; System.arraycopy(literals, numLiterals, tailVals, 0, MIN_REPEAT); // determine variable encoding and flush values determineEncoding(); writeValues(); // shift tail fixed runs to beginning of the buffer for(long l : tailVals) { literals[numLiterals++] = l; } } // if fixed runs reached max repeat length then write values if (fixedRunLength == MAX_SCOPE) { determineEncoding(); writeValues(); } } else { // variable delta run // if fixed run length is non-zero and if it satisfies the // short repeat conditions then write the values as short repeats // else use delta encoding if (fixedRunLength >= MIN_REPEAT) { if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { encoding = EncodingType.SHORT_REPEAT; writeValues(); } else { encoding = EncodingType.DELTA; isFixedDelta = true; 
writeValues(); } } // if fixed run length is 0 && fixedRunLength < MIN_REPEAT) { if (val != literals[numLiterals - 1]) { variableRunLength = fixedRunLength; fixedRunLength = 0; } } // after writing values re-initialize the variables if (numLiterals == 0) { initializeLiterals(val); } else { // keep updating variable run lengths prevDelta = val - literals[numLiterals - 1]; literals[numLiterals++] = val; variableRunLength += 1; // if variable run length reach the max scope, write it if (variableRunLength == MAX_SCOPE) { determineEncoding(); writeValues(); } } } } } } private void initializeLiterals(long val) { literals[numLiterals++] = val; fixedRunLength = 1; variableRunLength = 1; } @Override public void getPosition(PositionRecorder recorder) throws IOException { output.getPosition(recorder); recorder.addPosition(numLiterals); } @Override public long estimateMemory() { return output.getBufferSize(); } }




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy