src.it.unimi.dsi.bits.HuTuckerTransformationStrategy Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of dsiutils Show documentation

The DSI utilities are a mishmash of classes accumulated during the last twenty years in projects developed at the DSI (Dipartimento di Scienze dell'Informazione, i.e., Information Sciences Department), now DI (Dipartimento di Informatica, i.e., Informatics Department), of the Universita` degli Studi di Milano.

There is a newer version: 2.7.3

Show newest version

package it.unimi.dsi.bits;

/*
 * DSI utilities
 *
 * Copyright (C) 2007-2020 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.compression.HuTuckerCodec;
import it.unimi.dsi.fastutil.chars.Char2IntMap;
import it.unimi.dsi.fastutil.chars.Char2IntOpenHashMap;

import java.util.Iterator;

/** A transformation strategy mapping strings to their {@linkplain HuTuckerCodec Hu-Tucker encoding}. The
 * encoding is guaranteed to preserve lexicographical ordering.
 *
 * <p>The code is built from the frequencies of the characters appearing in the character sequences
 * provided at construction time: each distinct character becomes a symbol, and symbols are numbered
 * following the natural order of characters.
 */
public class HuTuckerTransformationStrategy extends PrefixCoderTransformationStrategy {
	private static final long serialVersionUID = 1;

	/** Creates a Hu-Tucker transformation strategy for the character sequences returned by the given iterable. The
	 * strategy will map a string to its Hu-Tucker encoding.
	 *
	 * @param iterable an iterable object returning character sequences.
	 * @param prefixFree if true, the resulting set of binary words will be prefix free.
	 */
	public HuTuckerTransformationStrategy(final Iterable<? extends CharSequence> iterable, final boolean prefixFree) {
		this(getCoder(iterable, prefixFree), prefixFree);
	}

	/** Creates a new strategy by passing the given strategy to the copy constructor of the superclass.
	 *
	 * @param huTuckerTransformationStrategy the strategy to be copied.
	 */
	protected HuTuckerTransformationStrategy(PrefixCoderTransformationStrategy huTuckerTransformationStrategy) {
		super(huTuckerTransformationStrategy);
	}

	/** Creates a new strategy from the pair of objects returned by {@link #getCoder(Iterable, boolean)}.
	 *
	 * @param a a two-element array: the first element must be an array of {@link BitVector} (the codewords),
	 * the second element a {@link Char2IntOpenHashMap} (the map from characters to symbols).
	 * @param prefixFree the prefix-free flag passed on to the superclass.
	 */
	protected HuTuckerTransformationStrategy(Object[] a, boolean prefixFree) {
		super((BitVector[])a[0], (Char2IntOpenHashMap)a[1], prefixFree);
	}

	/** Gathers character frequencies from the given sequences and builds the Hu-Tucker codewords.
	 *
	 * <p>Characters that actually appear are mapped to consecutive symbols in increasing character order.
	 * When <code>prefixFree</code> is true, symbol 0 is reserved for a stop character whose frequency is
	 * the number of sequences, and characters are numbered starting from 1; the stop character is not
	 * inserted in the returned map.
	 *
	 * @param iterable an iterable object returning character sequences.
	 * @param prefixFree whether a symbol for the stop character must be reserved.
	 * @return a two-element array containing the codewords of the Hu-Tucker coder and the map from
	 * characters to symbols, in this order.
	 */
	private static Object[] getCoder(final Iterable<? extends CharSequence> iterable, boolean prefixFree) {
		// First of all, we gather frequencies for all possible char values.
		final long[] frequency = new long[Character.MAX_VALUE + 1];
		int numStrings = 0;

		for(final CharSequence s : iterable) {
			for(int j = s.length(); j-- != 0;) frequency[s.charAt(j)]++;
			numStrings++;
		}

		// Then, we compute the number of actually used characters. We count from the start the stop character.
		int count = prefixFree ? 1 : 0;
		for(int i = frequency.length; i-- != 0;) if (frequency[i] != 0) count++;

		/* Now we pack the nonzero frequencies, building at the same time the map from characters to symbols
		 * (except for the stop character). We scan characters downwards while filling the packed array from
		 * its end, so that symbols follow character order and, if requested, slot 0 is left to the stop character. */
		final long[] packedFrequency = new long[count];
		final Char2IntMap char2symbol = new Char2IntOpenHashMap(count);

		for(int i = frequency.length, k = count; i-- != 0;) {
			if (frequency[i] != 0) {
				packedFrequency[--k] = frequency[i];
				char2symbol.put((char)i, k);
			}
		}

		if (prefixFree) packedFrequency[0] = numStrings; // The stop character appears once in each string.

		// We now build the coder used to code the strings
		return new Object[] { new HuTuckerCodec(packedFrequency).coder().codeWords(), char2symbol };
	}

	/** Returns a new strategy created from this one using the protected copy constructor.
	 *
	 * @return a new {@link HuTuckerTransformationStrategy} built from this strategy.
	 */
	@Override
	public PrefixCoderTransformationStrategy copy() {
		return new HuTuckerTransformationStrategy(this);
	}
}