src.it.unimi.dsi.compression.HuffmanCodec Maven / Gradle / Ivy

Go to download
package it.unimi.dsi.compression;

import java.io.Serializable;
import java.util.Arrays;

/*
 * DSI utilities
 *
 * Copyright (C) 2005-2017 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.bits.BitVector;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.ints.IntArrays;

/** An implementation of Huffman optimal prefix-free coding.
 *
 * A Huffman coder is built starting from an array of frequencies corresponding to each
 * symbol. Frequency 0 symbols are allowed, but they will degrade the resulting code.
 *
 * 
 * <p>Instances of this class compute a canonical Huffman code
 * (Eugene S. Schwartz and Bruce Kallick, “Generating a Canonical Prefix Encoding”, Commun. ACM 7(3), pages 166−169, 1964), which can
 * be {@linkplain CanonicalFast64CodeWordDecoder quickly decoded using table lookups}.
 * The construction uses the most efficient one-pass in-place codelength computation procedure
 * described by Alistair Moffat and Jyrki Katajainen in “In-Place Calculation of Minimum-Redundancy Codes”,
 * Algorithms and Data Structures, 4th International Workshop,
 * number 955 in Lecture Notes in Computer Science, pages 393−402, Springer-Verlag, 1995.
 *
 * <p>We note in passing that this codec uses a {@link CanonicalFast64CodeWordDecoder}, which does not support codelengths above 64.
 * However, since the worst case for codelengths is given by Fibonacci numbers, and frequencies are to be provided as integers,
 * no codeword longer than the base-[(5<sup>1/2</sup> + 1)/2] logarithm of 5<sup>1/2</sup> · 2<sup>31</sup> (less than 47) will ever be generated. */

public class HuffmanCodec implements PrefixCodec, Serializable {
	private static final boolean DEBUG = false;
	private static final boolean ASSERTS = false;
	private static final long serialVersionUID = 2L;

	/** The number of symbols of this coder. */
	public final int size;
	/** The codewords for this coder. */
	private final BitVector[] codeWord;
	/** A cached singleton instance of the coder of this codec. */
	private final Fast64CodeWordCoder coder;
	/** A cached singleton instance of the decoder of this codec. */
	private final CanonicalFast64CodeWordDecoder decoder;

	/** Widens an array of integers into a newly allocated array of longs.
	 *
	 * @param a an array of integers.
	 * @return a new array of the same length containing the same values.
	 */
	private static long[] intArray2LongArray(final int[] a) {
		final long[] b = new long[a.length];
		for(int i = a.length; i-- != 0;) b[i] = a[i];
		return b;
	}

	/** Computes the lengths of the codewords of a Huffman code for the given frequencies.
	 *
	 * <p>The computation follows the in-place procedure from Moffat &amp; Katajainen's sample code
	 * (please refer to their paper); the argument is not modified, as the procedure works on a sorted copy.
	 *
	 * @param frequency a vector of at least two frequencies.
	 * @return the codeword lengths in nondecreasing order; the <var>i</var>-th length
	 * belongs to the <var>i</var>-th symbol in order of decreasing frequency.
	 */
	private static int[] computeCodeLengths(final long[] frequency) {
		final int size = frequency.length;

		// Sort a private copy of the frequencies (this is the only n log n step).
		final long[] a = frequency.clone();
		Arrays.sort(a);

		// First pass, left to right, setting parent pointers.
		a[0] += a[1];
		int root = 0;
		int leaf = 2;
		for (int next = 1; next < size - 1; next++) {
			// Select first item for a pairing.
			if (leaf >= size || a[root] < a[leaf]) {
				a[next] = a[root];
				a[root++] = next;
			}
			else a[next] = a[leaf++];

			// Add on the second item.
			if (leaf >= size || (root < next && a[root] < a[leaf])) {
				a[next] += a[root];
				a[root++] = next;
			}
			else a[next] += a[leaf++];
		}

		// Second pass, right to left, setting internal depths.
		a[size - 2] = 0;
		for (int next = size - 3; next >= 0; next--) a[next] = a[(int)a[next]] + 1;

		// Third pass, right to left, setting leaf depths.
		int available = 1, used = 0, depth = 0;
		root = size - 2;
		int next = size - 1;
		while (available > 0) {
			while (root >= 0 && a[root] == depth) {
				used++;
				root--;
			}
			while (available > used) {
				a[next--] = depth;
				available--;
			}
			available = 2 * used;
			depth++;
			used = 0;
		}

		// Reverse the order of symbol lengths, and store them into an int array.
		final int[] length = new int[size];
		for(int i = size; i-- != 0;) length[i] = (int)a[size - 1 - i];
		return length;
	}

	/** Creates a new Huffman codec using the given vector of frequencies.
	 *
	 * @param frequency a vector of nonnegative frequencies.
	 */
	public HuffmanCodec(final int[] frequency) {
		this(intArray2LongArray(frequency));
	}

	/** Creates a new Huffman codec using the given vector of frequencies.
	 *
	 * @param frequency a vector of nonnegative frequencies.
	 * @throws IllegalArgumentException if the frequencies would generate a codeword longer than
	 * {@link Long#SIZE} bits, which cannot be represented by the <code>long</code>-based codewords built here.
	 */
	public HuffmanCodec(final long[] frequency) {
		size = frequency.length;

		if (size == 0 || size == 1) {
			// Degenerate alphabets: no codeword at all, or a single empty codeword.
			codeWord = new BitVector[size];
			if (size == 1) codeWord[0] = LongArrayBitVector.getInstance();
			coder = new Fast64CodeWordCoder(codeWord, new long[size]);
			decoder = new CanonicalFast64CodeWordDecoder(new int[size], new int[size]);
			return;
		}

		final int[] length = computeCodeLengths(frequency);

		// Lengths are nondecreasing, so the last one is the longest. Codewords are accumulated
		// in a long below: a longer codeword would silently wrap around in the shifts and
		// produce a corrupt code, so we refuse it explicitly.
		final int maxLength = length[size - 1];
		if (maxLength > Long.SIZE) throw new IllegalArgumentException("The given frequencies generate a codeword of " + maxLength + " bits, but at most " + Long.SIZE + " bits are supported");

		// Sort symbols indices by decreasing frequencies (so symbols correspond to lengths).
		final int[] symbol = new int[size];
		for(int i = size; i-- != 0;) symbol[i] = i;
		IntArrays.quickSort(symbol, 0, size, (x,y) -> Long.compare(frequency[y], frequency[x]));

		// Assign codewords (just for the coder--the decoder needs just the lengths).
		int s = symbol[0];
		int l = length[0];
		long value = 0;
		BitVector v;
		codeWord = new BitVector[size];
		final long[] longCodeWord = new long[size];
		// The most frequent symbol gets the all-zeroes codeword (longCodeWord[s] is already 0).
		codeWord[s] = LongArrayBitVector.getInstance().length(l);

		for(int i = 1; i < size; i++) {
			s = symbol[i];
			// Canonical assignment: the next codeword is the successor of the previous one,
			// padded with zeroes on the right whenever the length grows.
			value++;
			if (length[i] != l) {
				value <<= length[i] - l;
				if (ASSERTS) assert length[i] > l;
				l = length[i];
			}
			v = LongArrayBitVector.getInstance().length(l);
			// Copy the l lowest bits of value into v, most significant bit first.
			for(int j = l; j-- != 0;) if ((1L << j & value) != 0) v.set(l - 1 - j);
			codeWord[s] = v;
			longCodeWord[s] = value;
		}

		coder = new Fast64CodeWordCoder(codeWord, longCodeWord);
		decoder = new CanonicalFast64CodeWordDecoder(length, symbol);

		if (DEBUG) {
			// Ask the coder directly, so that no overridable method is invoked during construction.
			final BitVector[] debugCodeWord = coder.codeWords();
			System.err.println("Codes: ");
			for(int i = 0; i < size; i++)
				System.err.println(i + " (" + debugCodeWord[i].length() + " bits): " + debugCodeWord[i]);

			long totFreq = 0;
			for(int i = size; i-- != 0;) totFreq += frequency[i];
			long totBits = 0;
			for(int i = size; i-- != 0;) totBits += frequency[i] * debugCodeWord[i].length();
			System.err.println("Compression: " + totBits + " / " + totFreq * Character.SIZE + " = " + (double)totBits/(totFreq * Character.SIZE));
		}
	}

	/** Returns the cached coder of this codec. */
	@Override
	public CodeWordCoder coder() {
		return coder;
	}

	/** Returns the cached decoder of this codec. */
	@Override
	public Decoder decoder() {
		return decoder;
	}

	/** Returns the number of symbols of this codec. */
	@Override
	public int size() {
		return size;
	}

	/** Returns the codewords of this codec, as provided by the underlying coder. */
	@Override
	public BitVector[] codeWords() {
		return coder.codeWords();
	}
}