src.it.unimi.dsi.compression.HuffmanCodec Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of dsiutils Show documentation
The DSI utilities are a mishmash of classes accumulated during the last twenty years in projects developed at the DSI (Dipartimento di Scienze dell'Informazione, i.e., Information Sciences Department), now DI (Dipartimento di Informatica, i.e., Informatics Department), of the Università degli Studi di Milano.
There is a newer version: 2.7.3
Show newest version
package it.unimi.dsi.compression;

import java.io.Serializable;
import java.util.Arrays;

/*
 * DSI utilities
 *
 * Copyright (C) 2005-2020 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.bits.BitVector;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.ints.IntArrays;

/** An implementation of Huffman optimal prefix-free coding.
 *
 * <p>A Huffman coder is built starting from an array of frequencies corresponding to each
 * symbol. Frequency 0 symbols are allowed, but they will degrade the resulting code.
 *
 * <p>Instances of this class compute a <em>canonical</em> Huffman code
 * (Eugene S. Schwartz and Bruce Kallick, “Generating a Canonical Prefix Encoding”, Commun. ACM 7(3), pages 166−169, 1964), which can
 * be {@linkplain CanonicalFast64CodeWordDecoder quickly decoded using table lookups}.
 * The construction uses the most efficient one-pass in-place codelength computation procedure
 * described by Alistair Moffat and Jyrki Katajainen in “In-Place Calculation of Minimum-Redundancy Codes”,
 * Algorithms and Data Structures, 4th International Workshop,
 * number 955 in Lecture Notes in Computer Science, pages 393−402, Springer-Verlag, 1995.
 *
 * <p>We note in passing that this codec uses a {@link CanonicalFast64CodeWordDecoder}, which does not support codelengths above 64.
 * Since the worst case for codelengths is given by Fibonacci numbers, frequencies provided as integers
 * are not expected to generate codewords longer than the base-[(5^1/2 + 1)/2] logarithm of 5^1/2 · 2³¹ (less than 47).
 * Frequencies provided as longs, however, can generate longer codewords (e.g., 90 symbols whose
 * frequencies are the Fibonacci numbers 1, 1, 2, 3, … yield an 89-bit codeword): in that case
 * the constructor throws an {@link IllegalArgumentException} instead of building a corrupted code. */

public class HuffmanCodec implements PrefixCodec, Serializable {
	private static final boolean DEBUG = false;
	private static final boolean ASSERTS = false;
	private static final long serialVersionUID = 2L;

	/** The number of symbols of this coder. */
	public final int size;
	/** The codewords for this coder. */
	private final BitVector[] codeWord;
	/** A cached singleton instance of the coder of this codec. */
	private final Fast64CodeWordCoder coder;
	/** A cached singleton instance of the decoder of this codec. */
	private final CanonicalFast64CodeWordDecoder decoder;

	/** Widens an array of integers to an array of longs.
	 *
	 * @param a an array of integers.
	 * @return a new array of longs with the same length and content as {@code a}.
	 */
	private static long[] intArray2LongArray(final int a[]) {
		final long[] b = new long[a.length];
		for(int i = a.length; i-- != 0;) b[i] = a[i];
		return b;
	}

	/** Computes the Huffman codelengths for the given frequencies.
	 *
	 * <p>The given array is not modified: the computation happens in place on a sorted copy.
	 *
	 * @param frequency a vector of at least two frequencies.
	 * @return the codelengths in nondecreasing order; the <var>i</var>-th length is to be assigned to
	 * the <var>i</var>-th symbol in order of decreasing frequency.
	 */
	private static int[] computeCodeLengths(final long[] frequency) {
		final int size = frequency.length;
		final long[] a = frequency.clone();
		// Sort frequencies (this is the only n log n step).
		Arrays.sort(a);

		// The following lines are from Moffat & Katajainen sample code. Please refer to their paper.

		// First pass, left to right, setting parent pointers.
		a[0] += a[1];
		int root = 0;
		int leaf = 2;
		for (int next = 1; next < size - 1; next++) {
			// Select first item for a pairing.
			if (leaf >= size || a[root] < a[leaf]) {
				a[next] = a[root];
				a[root++] = next;
			}
			else a[next] = a[leaf++];

			// Add on the second item.
			if (leaf >= size || (root < next && a[root] < a[leaf])) {
				a[next] += a[root];
				a[root++] = next;
			}
			else a[next] += a[leaf++];
		}

		// Second pass, right to left, setting internal depths.
		a[size - 2] = 0;
		for (int next = size - 3; next >= 0; next--) a[next] = a[(int)a[next]] + 1;

		// Third pass, right to left, setting leaf depths.
		int available = 1, used = 0, depth = 0;
		root = size - 2;
		int next = size - 1;
		while (available > 0) {
			// Count the internal nodes at the current depth...
			while (root >= 0 && a[root] == depth) {
				used++;
				root--;
			}
			// ...the remaining slots at this depth are leaves.
			while (available > used) {
				a[next--] = depth;
				available--;
			}
			available = 2 * used;
			depth++;
			used = 0;
		}

		// Reverse the order of symbol lengths, and store them into an int array.
		final int[] length = new int[size];
		for(int i = size; i-- != 0;) length[i] = (int)a[size - 1 - i];
		return length;
	}

	/** Creates a new Huffman codec using the given vector of frequencies.
	 *
	 * @param frequency a vector of nonnegative frequencies.
	 * @throws IllegalArgumentException if the resulting code would contain a codeword longer than 64 bits.
	 */
	public HuffmanCodec(final int[] frequency) {
		this(intArray2LongArray(frequency));
	}

	/** Creates a new Huffman codec using the given vector of frequencies.
	 *
	 * <p>An empty vector generates an empty code, and a vector with a single frequency
	 * generates a code whose only codeword is empty.
	 *
	 * @param frequency a vector of nonnegative frequencies.
	 * @throws IllegalArgumentException if the resulting code would contain a codeword longer than 64 bits.
	 */
	public HuffmanCodec(final long[] frequency) {
		size = frequency.length;

		if (size == 0 || size == 1) {
			codeWord = new BitVector[size];
			if (size == 1) codeWord[0] = LongArrayBitVector.getInstance();
			coder = new Fast64CodeWordCoder(codeWord, new long[size]);
			decoder = new CanonicalFast64CodeWordDecoder(new int[size], new int[size]);
			return;
		}

		final int[] length = computeCodeLengths(frequency);

		// Lengths are nondecreasing, so the last one is the maximum. Codewords are assembled in a long
		// below: beyond 64 bits the shifts would silently drop or wrap bits, generating a wrong code.
		final int maxLength = length[size - 1];
		if (maxLength > Long.SIZE) throw new IllegalArgumentException("The given frequencies generate a codeword of " + maxLength + " bits, but codewords longer than " + Long.SIZE + " bits are not supported");

		// Sort symbols indices by decreasing frequencies (so symbols correspond to lengths).
		final int[] symbol = new int[size];
		for(int i = size; i-- != 0;) symbol[i] = i;
		IntArrays.quickSort(symbol, 0, size, (x,y) -> Long.compare(frequency[y], frequency[x]));

		// Assign codewords (just for the coder--the decoder needs just the lengths).
		int s = symbol[0];
		int l = length[0];
		long value = 0;
		BitVector v;
		codeWord = new BitVector[size];
		final long[] longCodeWord = new long[size];
		// The first (shortest) codeword is made of zeroes only.
		codeWord[s] = LongArrayBitVector.getInstance().length(l);

		for(int i = 1; i < size; i++) {
			s = symbol[i];
			if (length[i] == l) value++;
			else {
				// Canonical code: increment, then pad with zeroes on the right up to the new length.
				value++;
				value <<= length[i] - l;
				if (ASSERTS) assert length[i] > l;
				l = length[i];
			}
			v = LongArrayBitVector.getInstance().length(l);
			// Copy the l lower bits of value into v, most significant bit first.
			for(int j = l; j-- != 0;) if ((1L << j & value) != 0) v.set(l - 1 - j);
			codeWord[s] = v;
			longCodeWord[s] = value;
		}

		coder = new Fast64CodeWordCoder(codeWord, longCodeWord);
		decoder = new CanonicalFast64CodeWordDecoder(length, symbol);

		if (DEBUG) {
			final BitVector[] debugCodeWord = codeWords();
			System.err.println("Codes: ");
			for(int i = 0; i < size; i++)
				System.err.println(i + " (" + debugCodeWord[i].length() + " bits): " + debugCodeWord[i]);

			long totFreq = 0;
			for(int i = size; i-- != 0;) totFreq += frequency[i];
			long totBits = 0;
			for(int i = size; i-- != 0;) totBits += frequency[i] * debugCodeWord[i].length();
			System.err.println("Compression: " + totBits + " / " + totFreq * Character.SIZE + " = " + (double)totBits/(totFreq * Character.SIZE));
		}
	}

	/** Returns the cached coder of this codec.
	 *
	 * @return the coder built by the constructor.
	 */
	@Override
	public CodeWordCoder coder() {
		return coder;
	}

	/** Returns the cached decoder of this codec.
	 *
	 * @return the decoder built by the constructor.
	 */
	@Override
	public Decoder decoder() {
		return decoder;
	}

	/** Returns the number of symbols of this codec.
	 *
	 * @return the number of symbols, that is, the length of the frequency vector passed to the constructor.
	 */
	@Override
	public int size() {
		return size;
	}

	/** Returns the codewords of this codec, as provided by its coder.
	 *
	 * @return the result of invoking {@code codeWords()} on the cached coder.
	 */
	@Override
	public BitVector[] codeWords() {
		return coder.codeWords();
	}
}