src.it.unimi.dsi.compression.HuffmanCodec Maven / Gradle / Ivy
package it.unimi.dsi.compression;
import java.io.Serializable;
import java.util.Arrays;
/*
* DSI utilities
*
* Copyright (C) 2005-2017 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see .
*
*/
import it.unimi.dsi.bits.BitVector;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.ints.IntArrays;
/** An implementation of Huffman optimal prefix-free coding.
*
* A Huffman coder is built starting from an array of frequencies corresponding to each
* symbol. Frequency 0 symbols are allowed, but they will degrade the resulting code.
*
*
Instances of this class compute a canonical Huffman code
* (Eugene S. Schwartz and Bruce Kallick, “Generating a Canonical Prefix Encoding”, Commun. ACM 7(3), pages 166−169, 1964), which can
* by {@linkplain CanonicalFast64CodeWordDecoder quickly decoded using table lookups}.
* The construction uses the most efficient one-pass in-place codelength computation procedure
* described by Alistair Moffat and Jyrki Katajainen in “In-Place Calculation of Minimum-Redundancy Codes”,
* Algorithms and Data Structures, 4th International Workshop,
* number 955 in Lecture Notes in Computer Science, pages 393−402, Springer-Verlag, 1995.
*
*
We note by passing that this coded uses a {@link CanonicalFast64CodeWordDecoder}, which does not support codelengths above 64.
* However, since the worst case for codelengths is given by Fibonacci numbers, and frequencies are to be provided as integers,
* no codeword longer than the base-[(51/2 + 1)/2] logarithm of 51/2 · 231 (less than 47) will ever be generated. */
public class HuffmanCodec implements PrefixCodec, Serializable {
private static final boolean DEBUG = false;
private static final boolean ASSERTS = false;
private static final long serialVersionUID = 2L;
/** The number of symbols of this coder. */
public final int size;
/** The codewords for this coder. */
private final BitVector[] codeWord;
/** A cached singleton instance of the coder of this codec. */
private final Fast64CodeWordCoder coder;
/** A cached singleton instance of the decoder of this codec. */
private final CanonicalFast64CodeWordDecoder decoder;
private static long[] intArray2LongArray(final int a[]) {
final long[] b = new long[a.length];
for(int i = a.length; i-- != 0;) b[i] = a[i];
return b;
}
/** Creates a new Huffman codec using the given vector of frequencies.
*
* @param frequency a vector of nonnnegative frequencies.
*/
public HuffmanCodec(final int[] frequency) {
this(intArray2LongArray(frequency));
}
/** Creates a new Huffman codec using the given vector of frequencies.
*
* @param frequency a vector of nonnnegative frequencies.
*/
public HuffmanCodec(final long[] frequency) {
size = frequency.length;
if (size == 0 || size == 1) {
codeWord = new BitVector[size];
if (size == 1) codeWord[0] = LongArrayBitVector.getInstance();
coder = new Fast64CodeWordCoder(codeWord, new long[size]);
decoder = new CanonicalFast64CodeWordDecoder(new int[size], new int[size]);
return;
}
final long[] a = new long[size];
for(int i = size; i-- != 0;) a[i] = frequency[i];
// Sort frequencies (this is the only n log n step).
Arrays.sort(a);
// The following lines are from Moffat & Katajainen sample code. Please refer to their paper.
// First pass, left to right, setting parent pointers.
a[0] += a[1];
int root = 0;
int leaf = 2;
for (int next = 1; next < size - 1; next++) {
// Select first item for a pairing.
if (leaf >= size || a[root] < a[leaf]) {
a[next] = a[root];
a[root++] = next;
}
else a[next] = a[leaf++];
// Add on the second item.
if (leaf >= size || (root < next && a[root] < a[leaf])) {
a[next] += a[root];
a[root++] = next;
}
else a[next] += a[leaf++];
}
// Second pass, right to left, setting internal depths.
a[size - 2] = 0;
for (int next = size - 3; next >= 0; next--) a[next] = a[(int)a[next]] + 1;
// Third pass, right to left, setting leaf depths.
int available = 1, used = 0, depth = 0;
root = size - 2;
int next = size - 1;
while (available > 0) {
while (root >= 0 && a[root] == depth) {
used++;
root--;
}
while (available > used) {
a[next--] = depth;
available--;
}
available = 2 * used;
depth++;
used = 0;
}
// Reverse the order of symbol lengths, and store them into an int array.
final int[] length = new int[size];
for(int i = size; i-- != 0;) length[i] = (int)a[size - 1 - i];
// Sort symbols indices by decreasing frequencies (so symbols correspond to lengths).
final int[] symbol = new int[size];
for(int i = size; i-- != 0;) symbol[i] = i;
IntArrays.quickSort(symbol, 0, size, (x,y) -> Long.compare(frequency[y], frequency[x]));
// Assign codewords (just for the coder--the decoder needs just the lengths).
int s = symbol[0];
int l = length[0];
long value = 0;
BitVector v;
codeWord = new BitVector[size];
final long[] longCodeWord = new long[size];
codeWord[s] = LongArrayBitVector.getInstance().length(l);
for(int i = 1; i < size; i++) {
s = symbol[i];
if (length[i] == l) value++;
else {
value++;
value <<= length[i] - l;
if (ASSERTS) assert length[i] > l;
l = length[i];
}
v = LongArrayBitVector.getInstance().length(l);
for(int j = l; j-- != 0;) if ((1L << j & value) != 0) v.set(l - 1 - j);
codeWord[s] = v;
longCodeWord[s] = value;
}
coder = new Fast64CodeWordCoder(codeWord, longCodeWord);
decoder = new CanonicalFast64CodeWordDecoder(length, symbol);
if (DEBUG) {
final BitVector[] codeWord = codeWords();
System.err.println("Codes: ");
for(int i = 0; i < size; i++)
System.err.println(i + " (" + codeWord[i].length() + " bits): " + codeWord[i]);
long totFreq = 0;
for(int i = size; i-- != 0;) totFreq += frequency[i];
long totBits = 0;
for(int i = size; i-- != 0;) totBits += frequency[i] * codeWord[i].length();
System.err.println("Compression: " + totBits + " / " + totFreq * Character.SIZE + " = " + (double)totBits/(totFreq * Character.SIZE));
}
}
@Override
public CodeWordCoder coder() {
return coder;
}
@Override
public Decoder decoder() {
return decoder;
}
@Override
public int size() {
return size;
}
@Override
public BitVector[] codeWords() {
return coder.codeWords();
}
}