src.it.unimi.dsi.compression.HuTuckerCodec Maven / Gradle / Ivy
Show all versions of dsiutils Show documentation
/*
* DSI utilities
*
* Copyright (C) 2005-2020 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see .
*
*/
package it.unimi.dsi.compression;
import java.io.Serializable;
import java.util.Arrays;
import it.unimi.dsi.bits.BitVector;
/** An implementation of the Hu–Tucker optimal lexicographical prefix-free code.
*
* The familiar Huffman coding technique can be extended so to preserve the order in which
* symbols are given to the coder, in the sense that if j<k, then the
* j-th symbol will get a code lexicographically smaller than the one
* assigned to the k-th symbol. This result can be obtained with a small loss in
* code length (for more details, see the third volume of The Art of Computer Programming).
*
*
A Hu–Tucker coder is built given an array of frequencies corresponding to each
* symbol. Frequency 0 symbols are allowed, but they will degrade the resulting code.
*
*
The implementation of this class is rather inefficient, and the time required to build
* a Hu–Tucker code is quadratic in the number of symbols.
* An O(n log n) implementation
* is possible, but it requires very sophisticated data structures.
*/
public class HuTuckerCodec implements PrefixCodec, Serializable {
private static final boolean DEBUG = false;
private static final long serialVersionUID = 2L;
/** The number of symbols of this coder. */
public final int size;
/** The root of the decoding tree. */
private final TreeDecoder.Node root;
/** A cached singleton instance of the coder of this codec. */
private final CodeWordCoder coder;
/** A cached singleton instance of the decoder of this codec. */
private final TreeDecoder decoder;
/** A node to be used for the tree construction: it records both the level and the index. */
private static final class LevelNode extends TreeDecoder.LeafNode {
private static final long serialVersionUID = 1L;
int level;
private LevelNode(final int symbol) {
super(symbol);
}
private LevelNode() {
super(-1);
}
}
private static long[] intArray2LongArray(final int a[]) {
final long[] b = new long[a.length];
for(int i = a.length; i-- != 0;) b[i] = a[i];
return b;
}
public HuTuckerCodec(final int[] frequency) {
this(intArray2LongArray(frequency));
}
public HuTuckerCodec(final long[] frequency) {
size = frequency.length;
final boolean[] internal = new boolean[size];
final boolean[] removed = new boolean[size];
final long[] compoundFrequency = new long[size];
final LevelNode[] externalNode = new LevelNode[size], node = new LevelNode[size];
long currPri;
int first, last, left, right, minLeft, minRight;
LevelNode n;
// We create a node with level information for each symbol
for(int i = size; i-- != 0;) {
compoundFrequency[i] = frequency[i];
node[i] = externalNode[i] = new LevelNode(i);
}
first = 0;
last = size - 1;
minLeft = 0;
int currMinLeft;
// First selection phase (see Knuth)
for(int i = size; --i != 0;) {
currMinLeft = minLeft = minRight = -1;
currPri = Long.MAX_VALUE;
while(removed[first]) first++;
while(removed[last]) last--;
right = first;
assert right < last;
while(right < last) {
left = currMinLeft = right;
do {
right++;
if (! removed[right]) {
if (compoundFrequency[currMinLeft] + compoundFrequency[right] < currPri) {
currPri = compoundFrequency[currMinLeft] + compoundFrequency[right];
minLeft = currMinLeft;
minRight = right;
}
if (compoundFrequency[right] < compoundFrequency[currMinLeft]) currMinLeft = right;
}
} while((removed[right] || internal[right]) && right < last);
assert right == last || (! removed[right] && ! internal[right]);
assert left < right;
}
internal[minLeft] = true;
removed[minRight] = true;
n = new LevelNode();
n.left = node[minLeft];
n.right = node[minRight];
node[minLeft] = n;
compoundFrequency[minLeft] += compoundFrequency[minRight];
}
// Recursive marking
markRec(node[minLeft], 0);
// We now restart the aggregation process
Arrays.fill(removed, false);
System.arraycopy(externalNode, 0, node, 0, size);
int currLevel, leftLevel;
first = 0;
minLeft = 0;
last = size - 1;
for(int i = size; --i != 0;) {
while(removed[first]) first++;
while(removed[last]) last--;
left = first;
currLevel = minLeft = minRight = -1;
while(left < last) {
leftLevel = node[left].level;
assert leftLevel > currLevel;
for(right = left + 1; right <= last && removed[right]; right++);
assert right <= last;
assert ! removed[right];
if (leftLevel == node[right].level) {
currLevel = leftLevel;
minLeft = left;
minRight = right;
}
do left++; while(left < last && (removed[left] || node[left].level <= currLevel));
}
removed[minRight] = true;
n = new LevelNode();
n.left = node[minLeft];
n.right = node[minRight];
n.level = currLevel - 1;
node[minLeft] = n;
}
root = rebuildTree(node[minLeft]);
decoder = new TreeDecoder(root, size);
coder = new CodeWordCoder(decoder.buildCodes());
if (DEBUG) {
final BitVector[] codeWord = coder.codeWords();
System.err.println("Codes: ");
for(int i = 0; i < size; i++)
System.err.println(i + " (" + codeWord[i].length() + " bits): " + codeWord[i]);
long totFreq = 0;
for(int i = size; i-- != 0;) totFreq += frequency[i];
long totBits = 0;
for(int i = size; i-- != 0;) totBits += frequency[i] * codeWord[i].length();
System.err.println("Compression: " + totBits + " / " + totFreq * Character.SIZE + " = " + (double)totBits/(totFreq * Character.SIZE));
}
}
/** We scan recursively the tree, making a copy that uses lightweight nodes. */
private TreeDecoder.Node rebuildTree(final LevelNode n) {
if (n == null) return null;
if (n.symbol != -1) return new TreeDecoder.LeafNode(n.symbol);
final TreeDecoder.Node newNode = new TreeDecoder.Node();
newNode.left = rebuildTree((LevelNode) n.left);
newNode.right = rebuildTree((LevelNode) n.right);
return newNode;
}
/** Mark recursively the height of each node. */
private void markRec(final LevelNode n, final int height) {
if (n == null) return;
n.level = height;
markRec((LevelNode) n.left, height + 1);
markRec((LevelNode) n.right, height + 1);
}
@Override
public CodeWordCoder coder() {
return coder;
}
@Override
public Decoder decoder() {
return decoder;
}
@Override
public int size() {
return size;
}
@Override
public BitVector[] codeWords() {
return coder.codeWords();
}
@Deprecated
public PrefixCoder getCoder() { return coder(); }
@Deprecated
public Decoder getDecoder() { return decoder(); }
}