src.it.unimi.dsi.sux4j.mph.HollowTrieDistributorMonotoneMinimalPerfectHashFunction Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of sux4j Show documentation
Show all versions of sux4j Show documentation
Sux4j is an implementation of succinct data structure in Java. It provides a number of related implementations covering ranking/selection over bit arrays, compressed lists and minimal perfect hashing.
package it.unimi.dsi.sux4j.mph;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.zip.GZIPInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import com.martiansoftware.jsap.stringparsers.FileStringParser;
import com.martiansoftware.jsap.stringparsers.ForNameStringParser;
/*
* Sux4J: Succinct data structures for Java
*
* Copyright (C) 2008-2019 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see .
*
*/
import it.unimi.dsi.bits.BitVector;
import it.unimi.dsi.bits.Fast;
import it.unimi.dsi.bits.HuTuckerTransformationStrategy;
import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.bits.TransformationStrategy;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.AbstractLongBigList;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
/** A monotone minimal perfect hash implementation based on fixed-size bucketing that uses
* a {@linkplain HollowTrieDistributor hollow trie} as a distributor.
*
*/
public class HollowTrieDistributorMonotoneMinimalPerfectHashFunction extends AbstractHashFunction implements Size64, Serializable {
public static final long serialVersionUID = 5L;
private static final Logger LOGGER = LoggerFactory.getLogger(HollowTrieDistributorMonotoneMinimalPerfectHashFunction.class);
/** The number of elements. */
private final long size;
/** The size of a bucket. */
private final int bucketSize;
/** {@link Fast#ceilLog2(int)} of {@link #bucketSize}. */
private final int log2BucketSize;
/** The transformation strategy. */
private final TransformationStrategy super T> transform;
/** A hollow trie distributor assigning keys to buckets. */
private final HollowTrieDistributor distributor;
/** The offset of each element into his bucket. */
private final GOV3Function offset;
@Override
@SuppressWarnings("unchecked")
public long getLong(final Object o) {
if (size <= 1) return defRetValue;
final BitVector bv = transform.toBitVector((T)o).fast();
final long bucket = distributor.getLong(bv);
// TODO: could use offset's return value to return defRetValue.
return (bucket << log2BucketSize) + offset.getLong(bv);
}
/** Creates a new hollow-trie-based monotone minimal perfect hash function using the given
* elements and transformation strategy, using the default temporary directory.
*
* @param elements the elements among which the trie must be able to rank.
* @param transform a transformation strategy that must turn the elements in elements
into a list of
* distinct, prefix-free, lexicographically increasing (in iteration order) bit vectors.
*/
public HollowTrieDistributorMonotoneMinimalPerfectHashFunction(final Iterable extends T> elements, final TransformationStrategy super T> transform) throws IOException {
this(elements, transform, null);
}
/** Creates a new hollow-trie-based monotone minimal perfect hash function using the given
* elements, transformation strategy, and temporary directory.
*
* @param elements the elements among which the trie must be able to rank.
* @param transform a transformation strategy that must turn the elements in elements
into a list of
* distinct, prefix-free, lexicographically increasing (in iteration order) bit vectors.
* @param tempDir a directory for the temporary files created during construction
* by the {@link HollowTrieDistributor}, or {@code null} for the default temporary directory.
*/
public HollowTrieDistributorMonotoneMinimalPerfectHashFunction(final Iterable extends T> elements, final TransformationStrategy super T> transform, File tempDir) throws IOException {
this.transform = transform;
long maxLength = 0;
long totalLength = 0;
long c = 0;
BitVector bv;
for(final T s: elements) {
bv = transform.toBitVector(s);
maxLength = Math.max(maxLength, bv.length());
totalLength += bv.length();
c++;
}
size = c;
defRetValue = size == 1 ? 0 : -1; // For the very few cases in which we can decide
if (size <= 1) {
bucketSize = log2BucketSize = 0;
distributor = null;
offset = null;
return;
}
final long averageLength = (totalLength + size - 1) / size;
// The distributor cannot contain just one string
final int l = Fast.ceilLog2(Math.round((long)((Math.log(averageLength) + 2) * Math.log(2) / GOV3Function.C)));
log2BucketSize = size / (1 << l) <= 1 ? 0 : l;
bucketSize = 1 << log2BucketSize;
final int bucketMask = bucketSize - 1;
LOGGER.debug("Bucket size: " + bucketSize);
final Iterable bitVectors = TransformationStrategies.wrap(elements, transform);
distributor = new HollowTrieDistributor<>(bitVectors, log2BucketSize, TransformationStrategies.identity(), tempDir);
offset = new GOV3Function.Builder().keys(bitVectors).transform(TransformationStrategies.identity()).values(new AbstractLongBigList() {
@Override
public long getLong(long index) {
return index & bucketMask;
}
@Override
public long size64() {
return size;
}
}, log2BucketSize).build();
LOGGER.debug("Forecast bit cost per element: " + (GOV3Function.C * (1 / Math.log(2) + 2 + Fast.log2(Math.log(2) / GOV3Function.C)) + Fast.log2(2 + Fast.log2(averageLength + 1))));
LOGGER.info("Actual bit cost per element: " + (double)numBits() / size);
}
@Override
public long size64() {
return size;
}
public long numBits() {
return distributor.numBits() + offset.numBits() + transform.numBits();
}
public static void main(final String[] arg) throws NoSuchMethodException, IOException, JSAPException {
final SimpleJSAP jsap = new SimpleJSAP(HollowTrieDistributorMonotoneMinimalPerfectHashFunction.class.getName(), "Builds a monotone minimal perfect hash using a hollow trie as a distributor reading a newline-separated list of strings.",
new Parameter[] {
new FlaggedOption("encoding", ForNameStringParser.getParser(Charset.class), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The string file encoding."),
new Switch("huTucker", 'h', "hu-tucker", "Use Hu-Tucker coding to reduce string length."),
new Switch("iso", 'i', "iso", "Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)."),
new Switch("utf32", JSAP.NO_SHORTFLAG, "utf-32", "Use UTF-32 internally (handles surrogate pairs)."),
new Switch("zipped", 'z', "zipped", "The string list is compressed in gzip format."),
new FlaggedOption("tempDir", FileStringParser.getParser(), JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 't', "temp-dir", "A temporary directory for the files created during the construction."),
new UnflaggedOption("function", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised monotone minimal perfect hash function."),
new UnflaggedOption("stringFile", JSAP.STRING_PARSER, "-", JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "The name of a file containing a newline-separated list of strings, or - for standard input; in the first case, strings will not be loaded into core memory."),
});
final JSAPResult jsapResult = jsap.parse(arg);
if (jsap.messagePrinted()) return;
final String functionName = jsapResult.getString("function");
final String stringFile = jsapResult.getString("stringFile");
final Charset encoding = (Charset)jsapResult.getObject("encoding");
final boolean zipped = jsapResult.getBoolean("zipped");
final boolean iso = jsapResult.getBoolean("iso");
final boolean utf32 = jsapResult.getBoolean("utf32");
final boolean huTucker = jsapResult.getBoolean("huTucker");
final File tempDir = jsapResult.getFile("tempDir");
final Collection collection;
if ("-".equals(stringFile)) {
final ProgressLogger pl = new ProgressLogger(LOGGER);
pl.displayLocalSpeed = true;
pl.displayFreeMemory = true;
pl.start("Loading strings...");
collection = new LineIterator(new FastBufferedReader(new InputStreamReader(zipped ? new GZIPInputStream(System.in) : System.in, encoding)), pl).allLines();
pl.done();
}
else collection = new FileLinesCollection(stringFile, encoding.toString(), zipped);
final TransformationStrategy transformationStrategy = huTucker
? new HuTuckerTransformationStrategy(collection, true)
: iso
? TransformationStrategies.prefixFreeIso()
: utf32
? TransformationStrategies.prefixFreeUtf32()
: TransformationStrategies.prefixFreeUtf16();
BinIO.storeObject(new HollowTrieDistributorMonotoneMinimalPerfectHashFunction(collection, transformationStrategy, tempDir), functionName);
LOGGER.info("Completed.");
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy