All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.compression.HuTuckerCodec Maven / Gradle / Ivy

package it.unimi.dsi.compression;

/*
 * DSI utilities
 *
 * Copyright (C) 2005-2017 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.bits.BitVector;

import java.io.Serializable;
import java.util.Arrays;

/** An implementation of the Hu–Tucker optimal lexicographical prefix-free code.
 *
 * 

The familiar Huffman coding technique can be extended so to preserve the order in which * symbols are given to the coder, in the sense that if j<k, then the * j-th symbol will get a code lexicographically smaller than the one * assigned to the k-th symbol. This result can be obtained with a small loss in * code length (for more details, see the third volume of The Art of Computer Programming). * *

A Hu–Tucker coder is built given an array of frequencies corresponding to each * symbol. Frequency 0 symbols are allowed, but they will degrade the resulting code. * *

The implementation of this class is rather inefficient, and the time required to build * a Hu–Tucker code is quadratic in the number of symbols. * An O(n log n) implementation * is possible, but it requires very sophisticated data structures. */ public class HuTuckerCodec implements PrefixCodec, Serializable { private static final boolean DEBUG = false; private static final long serialVersionUID = 2L; /** The number of symbols of this coder. */ public final int size; /** The root of the decoding tree. */ private final TreeDecoder.Node root; /** A cached singleton instance of the coder of this codec. */ private final CodeWordCoder coder; /** A cached singleton instance of the decoder of this codec. */ private final TreeDecoder decoder; /** A node to be used for the tree construction: it records both the level and the index. */ private static final class LevelNode extends TreeDecoder.LeafNode { private static final long serialVersionUID = 1L; int level; private LevelNode(final int symbol) { super(symbol); } private LevelNode() { super(-1); } } private static long[] intArray2LongArray(final int a[]) { final long[] b = new long[a.length]; for(int i = a.length; i-- != 0;) b[i] = a[i]; return b; } public HuTuckerCodec(final int[] frequency) { this(intArray2LongArray(frequency)); } public HuTuckerCodec(final long[] frequency) { size = frequency.length; final boolean[] internal = new boolean[size]; final boolean[] removed = new boolean[size]; final long[] compoundFrequency = new long[size]; final LevelNode[] externalNode = new LevelNode[size], node = new LevelNode[size]; long currPri; int first, last, left, right, minLeft, minRight; LevelNode n; // We create a node with level information for each symbol for(int i = size; i-- != 0;) { compoundFrequency[i] = frequency[i]; node[i] = externalNode[i] = new LevelNode(i); } first = 0; last = size - 1; minLeft = 0; int currMinLeft; // First selection phase (see Knuth) for(int i = size; --i != 0;) { currMinLeft = minLeft = minRight 
= -1; currPri = Long.MAX_VALUE; while(removed[first]) first++; while(removed[last]) last--; right = first; assert right < last; while(right < last) { left = currMinLeft = right; do { right++; if (! removed[right]) { if (compoundFrequency[currMinLeft] + compoundFrequency[right] < currPri) { currPri = compoundFrequency[currMinLeft] + compoundFrequency[right]; minLeft = currMinLeft; minRight = right; } if (compoundFrequency[right] < compoundFrequency[currMinLeft]) currMinLeft = right; } } while((removed[right] || internal[right]) && right < last); assert right == last || (! removed[right] && ! internal[right]); assert left < right; } internal[minLeft] = true; removed[minRight] = true; n = new LevelNode(); n.left = node[minLeft]; n.right = node[minRight]; node[minLeft] = n; compoundFrequency[minLeft] += compoundFrequency[minRight]; } // Recursive marking markRec(node[minLeft], 0); // We now restart the aggregation process Arrays.fill(removed, false); System.arraycopy(externalNode, 0, node, 0, size); int currLevel, leftLevel; first = 0; minLeft = 0; last = size - 1; for(int i = size; --i != 0;) { while(removed[first]) first++; while(removed[last]) last--; left = first; currLevel = minLeft = minRight = -1; while(left < last) { leftLevel = node[left].level; assert leftLevel > currLevel; for(right = left + 1; right <= last && removed[right]; right++); assert right <= last; assert ! 
removed[right]; if (leftLevel == node[right].level) { currLevel = leftLevel; minLeft = left; minRight = right; } do left++; while(left < last && (removed[left] || node[left].level <= currLevel)); } removed[minRight] = true; n = new LevelNode(); n.left = node[minLeft]; n.right = node[minRight]; n.level = currLevel - 1; node[minLeft] = n; } root = rebuildTree(node[minLeft]); decoder = new TreeDecoder(root, size); coder = new CodeWordCoder(decoder.buildCodes()); if (DEBUG) { final BitVector[] codeWord = coder.codeWords(); System.err.println("Codes: "); for(int i = 0; i < size; i++) System.err.println(i + " (" + codeWord[i].length() + " bits): " + codeWord[i]); long totFreq = 0; for(int i = size; i-- != 0;) totFreq += frequency[i]; long totBits = 0; for(int i = size; i-- != 0;) totBits += frequency[i] * codeWord[i].length(); System.err.println("Compression: " + totBits + " / " + totFreq * Character.SIZE + " = " + (double)totBits/(totFreq * Character.SIZE)); } } /** We scan recursively the tree, making a copy that uses lightweight nodes. */ private TreeDecoder.Node rebuildTree(final LevelNode n) { if (n == null) return null; if (n.symbol != -1) return new TreeDecoder.LeafNode(n.symbol); TreeDecoder.Node newNode = new TreeDecoder.Node(); newNode.left = rebuildTree((LevelNode) n.left); newNode.right = rebuildTree((LevelNode) n.right); return newNode; } /** Mark recursively the height of each node. */ private void markRec(final LevelNode n, final int height) { if (n == null) return; n.level = height; markRec((LevelNode) n.left, height + 1); markRec((LevelNode) n.right, height + 1); } @Override public CodeWordCoder coder() { return coder; } @Override public Decoder decoder() { return decoder; } @Override public int size() { return size; } @Override public BitVector[] codeWords() { return coder.codeWords(); } @Deprecated public PrefixCoder getCoder() { return coder(); } @Deprecated public Decoder getDecoder() { return decoder(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy