io.airlift.compress.zstd.SequenceEncoder Maven / Gradle / Ivy
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.airlift.compress.zstd;
import static io.airlift.compress.zstd.Constants.DEFAULT_MAX_OFFSET_CODE_SYMBOL;
import static io.airlift.compress.zstd.Constants.LITERALS_LENGTH_BITS;
import static io.airlift.compress.zstd.Constants.LITERAL_LENGTH_TABLE_LOG;
import static io.airlift.compress.zstd.Constants.LONG_NUMBER_OF_SEQUENCES;
import static io.airlift.compress.zstd.Constants.MATCH_LENGTH_BITS;
import static io.airlift.compress.zstd.Constants.MATCH_LENGTH_TABLE_LOG;
import static io.airlift.compress.zstd.Constants.MAX_LITERALS_LENGTH_SYMBOL;
import static io.airlift.compress.zstd.Constants.MAX_MATCH_LENGTH_SYMBOL;
import static io.airlift.compress.zstd.Constants.MAX_OFFSET_CODE_SYMBOL;
import static io.airlift.compress.zstd.Constants.OFFSET_TABLE_LOG;
import static io.airlift.compress.zstd.Constants.SEQUENCE_ENCODING_BASIC;
import static io.airlift.compress.zstd.Constants.SEQUENCE_ENCODING_COMPRESSED;
import static io.airlift.compress.zstd.Constants.SEQUENCE_ENCODING_RLE;
import static io.airlift.compress.zstd.Constants.SIZE_OF_SHORT;
import static io.airlift.compress.zstd.FiniteStateEntropy.optimalTableLog;
import static io.airlift.compress.zstd.UnsafeUtil.UNSAFE;
import static io.airlift.compress.zstd.Util.checkArgument;
class SequenceEncoder
{
private static final int DEFAULT_LITERAL_LENGTH_NORMALIZED_COUNTS_LOG = 6;
private static final short[] DEFAULT_LITERAL_LENGTH_NORMALIZED_COUNTS = {4, 3, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2,
2, 3, 2, 1, 1, 1, 1, 1,
-1, -1, -1, -1};
private static final int DEFAULT_MATCH_LENGTH_NORMALIZED_COUNTS_LOG = 6;
private static final short[] DEFAULT_MATCH_LENGTH_NORMALIZED_COUNTS = {1, 4, 3, 2, 2, 2, 2, 2,
2, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, -1, -1,
-1, -1, -1, -1, -1};
private static final int DEFAULT_OFFSET_NORMALIZED_COUNTS_LOG = 5;
private static final short[] DEFAULT_OFFSET_NORMALIZED_COUNTS = {1, 1, 1, 1, 1, 1, 2, 2,
2, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,
-1, -1, -1, -1, -1};
private static final FseCompressionTable DEFAULT_LITERAL_LENGTHS_TABLE = FseCompressionTable.newInstance(DEFAULT_LITERAL_LENGTH_NORMALIZED_COUNTS, MAX_LITERALS_LENGTH_SYMBOL, DEFAULT_LITERAL_LENGTH_NORMALIZED_COUNTS_LOG);
private static final FseCompressionTable DEFAULT_MATCH_LENGTHS_TABLE = FseCompressionTable.newInstance(DEFAULT_MATCH_LENGTH_NORMALIZED_COUNTS, MAX_MATCH_LENGTH_SYMBOL, DEFAULT_LITERAL_LENGTH_NORMALIZED_COUNTS_LOG);
private static final FseCompressionTable DEFAULT_OFFSETS_TABLE = FseCompressionTable.newInstance(DEFAULT_OFFSET_NORMALIZED_COUNTS, DEFAULT_MAX_OFFSET_CODE_SYMBOL, DEFAULT_OFFSET_NORMALIZED_COUNTS_LOG);
private SequenceEncoder()
{
}
public static int compressSequences(Object outputBase, final long outputAddress, int outputSize, SequenceStore sequences, CompressionParameters.Strategy strategy, SequenceEncodingContext workspace)
{
long output = outputAddress;
long outputLimit = outputAddress + outputSize;
checkArgument(outputLimit - output > 3 /* max sequence count Size */ + 1 /* encoding type flags */, "Output buffer too small");
int sequenceCount = sequences.sequenceCount;
if (sequenceCount < 0x7F) {
UNSAFE.putByte(outputBase, output, (byte) sequenceCount);
output++;
}
else if (sequenceCount < LONG_NUMBER_OF_SEQUENCES) {
UNSAFE.putByte(outputBase, output, (byte) (sequenceCount >>> 8 | 0x80));
UNSAFE.putByte(outputBase, output + 1, (byte) sequenceCount);
output += SIZE_OF_SHORT;
}
else {
UNSAFE.putByte(outputBase, output, (byte) 0xFF);
output++;
UNSAFE.putShort(outputBase, output, (short) (sequenceCount - LONG_NUMBER_OF_SEQUENCES));
output += SIZE_OF_SHORT;
}
if (sequenceCount == 0) {
return (int) (output - outputAddress);
}
// flags for FSE encoding type
long headerAddress = output++;
int maxSymbol;
int largestCount;
// literal lengths
int[] counts = workspace.counts;
Histogram.count(sequences.literalLengthCodes, sequenceCount, workspace.counts);
maxSymbol = Histogram.findMaxSymbol(counts, MAX_LITERALS_LENGTH_SYMBOL);
largestCount = Histogram.findLargestCount(counts, maxSymbol);
int literalsLengthEncodingType = selectEncodingType(largestCount, sequenceCount, DEFAULT_LITERAL_LENGTH_NORMALIZED_COUNTS_LOG, true, strategy);
FseCompressionTable literalLengthTable;
switch (literalsLengthEncodingType) {
case SEQUENCE_ENCODING_RLE:
UNSAFE.putByte(outputBase, output, sequences.literalLengthCodes[0]);
output++;
workspace.literalLengthTable.initializeRleTable(maxSymbol);
literalLengthTable = workspace.literalLengthTable;
break;
case SEQUENCE_ENCODING_BASIC:
literalLengthTable = DEFAULT_LITERAL_LENGTHS_TABLE;
break;
case SEQUENCE_ENCODING_COMPRESSED:
output += buildCompressionTable(
workspace.literalLengthTable,
outputBase,
output,
outputLimit,
sequenceCount,
LITERAL_LENGTH_TABLE_LOG,
sequences.literalLengthCodes,
workspace.counts,
maxSymbol,
workspace.normalizedCounts);
literalLengthTable = workspace.literalLengthTable;
break;
default:
throw new UnsupportedOperationException("not yet implemented");
}
// offsets
Histogram.count(sequences.offsetCodes, sequenceCount, workspace.counts);
maxSymbol = Histogram.findMaxSymbol(counts, MAX_OFFSET_CODE_SYMBOL);
largestCount = Histogram.findLargestCount(counts, maxSymbol);
// We can only use the basic table if max <= DEFAULT_MAX_OFFSET_CODE_SYMBOL, otherwise the offsets are too large .
boolean defaultAllowed = maxSymbol < DEFAULT_MAX_OFFSET_CODE_SYMBOL;
int offsetEncodingType = selectEncodingType(largestCount, sequenceCount, DEFAULT_OFFSET_NORMALIZED_COUNTS_LOG, defaultAllowed, strategy);
FseCompressionTable offsetCodeTable;
switch (offsetEncodingType) {
case SEQUENCE_ENCODING_RLE:
UNSAFE.putByte(outputBase, output, sequences.offsetCodes[0]);
output++;
workspace.offsetCodeTable.initializeRleTable(maxSymbol);
offsetCodeTable = workspace.offsetCodeTable;
break;
case SEQUENCE_ENCODING_BASIC:
offsetCodeTable = DEFAULT_OFFSETS_TABLE;
break;
case SEQUENCE_ENCODING_COMPRESSED:
output += buildCompressionTable(
workspace.offsetCodeTable,
outputBase,
output,
output + outputSize,
sequenceCount,
OFFSET_TABLE_LOG,
sequences.offsetCodes,
workspace.counts,
maxSymbol,
workspace.normalizedCounts);
offsetCodeTable = workspace.offsetCodeTable;
break;
default:
throw new UnsupportedOperationException("not yet implemented");
}
// match lengths
Histogram.count(sequences.matchLengthCodes, sequenceCount, workspace.counts);
maxSymbol = Histogram.findMaxSymbol(counts, MAX_MATCH_LENGTH_SYMBOL);
largestCount = Histogram.findLargestCount(counts, maxSymbol);
int matchLengthEncodingType = selectEncodingType(largestCount, sequenceCount, DEFAULT_MATCH_LENGTH_NORMALIZED_COUNTS_LOG, true, strategy);
FseCompressionTable matchLengthTable;
switch (matchLengthEncodingType) {
case SEQUENCE_ENCODING_RLE:
UNSAFE.putByte(outputBase, output, sequences.matchLengthCodes[0]);
output++;
workspace.matchLengthTable.initializeRleTable(maxSymbol);
matchLengthTable = workspace.matchLengthTable;
break;
case SEQUENCE_ENCODING_BASIC:
matchLengthTable = DEFAULT_MATCH_LENGTHS_TABLE;
break;
case SEQUENCE_ENCODING_COMPRESSED:
output += buildCompressionTable(
workspace.matchLengthTable,
outputBase,
output,
outputLimit,
sequenceCount,
MATCH_LENGTH_TABLE_LOG,
sequences.matchLengthCodes,
workspace.counts,
maxSymbol,
workspace.normalizedCounts);
matchLengthTable = workspace.matchLengthTable;
break;
default:
throw new UnsupportedOperationException("not yet implemented");
}
// flags
UNSAFE.putByte(outputBase, headerAddress, (byte) ((literalsLengthEncodingType << 6) | (offsetEncodingType << 4) | (matchLengthEncodingType << 2)));
output += encodeSequences(outputBase, output, outputLimit, matchLengthTable, offsetCodeTable, literalLengthTable, sequences);
return (int) (output - outputAddress);
}
private static int buildCompressionTable(FseCompressionTable table, Object outputBase, long output, long outputLimit, int sequenceCount, int maxTableLog, byte[] codes, int[] counts, int maxSymbol, short[] normalizedCounts)
{
int tableLog = optimalTableLog(maxTableLog, sequenceCount, maxSymbol);
// this is a minor optimization. The last symbol is embedded in the initial FSE state, so it's not part of the bitstream. We can omit it from the
// statistics (but only if its count is > 1). This makes the statistics a tiny bit more accurate.
if (counts[codes[sequenceCount - 1]] > 1) {
counts[codes[sequenceCount - 1]]--;
sequenceCount--;
}
FiniteStateEntropy.normalizeCounts(normalizedCounts, tableLog, counts, sequenceCount, maxSymbol);
table.initialize(normalizedCounts, maxSymbol, tableLog);
return FiniteStateEntropy.writeNormalizedCounts(outputBase, output, (int) (outputLimit - output), normalizedCounts, maxSymbol, tableLog); // TODO: pass outputLimit directly
}
private static int encodeSequences(
Object outputBase,
long output,
long outputLimit,
FseCompressionTable matchLengthTable,
FseCompressionTable offsetsTable,
FseCompressionTable literalLengthTable,
SequenceStore sequences)
{
byte[] matchLengthCodes = sequences.matchLengthCodes;
byte[] offsetCodes = sequences.offsetCodes;
byte[] literalLengthCodes = sequences.literalLengthCodes;
BitOutputStream blockStream = new BitOutputStream(outputBase, output, (int) (outputLimit - output));
int sequenceCount = sequences.sequenceCount;
// first symbols
int matchLengthState = matchLengthTable.begin(matchLengthCodes[sequenceCount - 1]);
int offsetState = offsetsTable.begin(offsetCodes[sequenceCount - 1]);
int literalLengthState = literalLengthTable.begin(literalLengthCodes[sequenceCount - 1]);
blockStream.addBits(sequences.literalLengths[sequenceCount - 1], LITERALS_LENGTH_BITS[literalLengthCodes[sequenceCount - 1]]);
blockStream.addBits(sequences.matchLengths[sequenceCount - 1], MATCH_LENGTH_BITS[matchLengthCodes[sequenceCount - 1]]);
blockStream.addBits(sequences.offsets[sequenceCount - 1], offsetCodes[sequenceCount - 1]);
blockStream.flush();
if (sequenceCount >= 2) {
for (int n = sequenceCount - 2; n >= 0; n--) {
byte literalLengthCode = literalLengthCodes[n];
byte offsetCode = offsetCodes[n];
byte matchLengthCode = matchLengthCodes[n];
int literalLengthBits = LITERALS_LENGTH_BITS[literalLengthCode];
int offsetBits = offsetCode;
int matchLengthBits = MATCH_LENGTH_BITS[matchLengthCode];
// (7)
offsetState = offsetsTable.encode(blockStream, offsetState, offsetCode); // 15
matchLengthState = matchLengthTable.encode(blockStream, matchLengthState, matchLengthCode); // 24
literalLengthState = literalLengthTable.encode(blockStream, literalLengthState, literalLengthCode); // 33
if ((offsetBits + matchLengthBits + literalLengthBits >= 64 - 7 - (LITERAL_LENGTH_TABLE_LOG + MATCH_LENGTH_TABLE_LOG + OFFSET_TABLE_LOG))) {
blockStream.flush(); /* (7)*/
}
blockStream.addBits(sequences.literalLengths[n], literalLengthBits);
if (((literalLengthBits + matchLengthBits) > 24)) {
blockStream.flush();
}
blockStream.addBits(sequences.matchLengths[n], matchLengthBits);
if ((offsetBits + matchLengthBits + literalLengthBits > 56)) {
blockStream.flush();
}
blockStream.addBits(sequences.offsets[n], offsetBits); // 31
blockStream.flush(); // (7)
}
}
matchLengthTable.finish(blockStream, matchLengthState);
offsetsTable.finish(blockStream, offsetState);
literalLengthTable.finish(blockStream, literalLengthState);
int streamSize = blockStream.close();
checkArgument(streamSize > 0, "Output buffer too small");
return streamSize;
}
private static int selectEncodingType(
int largestCount,
int sequenceCount,
int defaultNormalizedCountsLog,
boolean isDefaultTableAllowed,
CompressionParameters.Strategy strategy)
{
if (largestCount == sequenceCount) { // => all entries are equal
if (isDefaultTableAllowed && sequenceCount <= 2) {
/* Prefer set_basic over set_rle when there are 2 or fewer symbols,
* since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
* If basic encoding isn't possible, always choose RLE.
*/
return SEQUENCE_ENCODING_BASIC;
}
return SEQUENCE_ENCODING_RLE;
}
if (strategy.ordinal() < CompressionParameters.Strategy.LAZY.ordinal()) { // TODO: more robust check. Maybe encapsulate in strategy objects
if (isDefaultTableAllowed) {
int factor = 10 - strategy.ordinal(); // TODO more robust. Move it to strategy
int baseLog = 3;
long minNumberOfSequences = ((1L << defaultNormalizedCountsLog) * factor) >> baseLog; /* 28-36 for offset, 56-72 for lengths */
if ((sequenceCount < minNumberOfSequences) || (largestCount < (sequenceCount >> (defaultNormalizedCountsLog - 1)))) {
/* The format allows default tables to be repeated, but it isn't useful.
* When using simple heuristics to select encoding type, we don't want
* to confuse these tables with dictionaries. When running more careful
* analysis, we don't need to waste time checking both repeating tables
* and default tables.
*/
return SEQUENCE_ENCODING_BASIC;
}
}
}
else {
// TODO implement when other strategies are supported
throw new UnsupportedOperationException("not yet implemented");
}
return SEQUENCE_ENCODING_COMPRESSED;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy