All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.big.util.LongBigListSignedStringMap Maven / Gradle / Ivy

package it.unimi.dsi.big.util;

/*
 * DSI utilities
 *
 * Copyright (C) 2013-2017 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */


import it.unimi.dsi.fastutil.Function;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.AbstractObject2LongFunction;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.fastutil.objects.ObjectBigList;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.ByteBufferLongBigList;

import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.zip.GZIPInputStream;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import com.martiansoftware.jsap.stringparsers.ForNameStringParser;

/** A string map based on a function signed using a big list of longs.
 *
 * 

The purpose of this map is identical to that of a {@link ShiftAddXorSignedStringMap}, but * Shift-Add-Xor signatures are 64-bit and stored in a {@link LongBigList}. This approach makes it possible to * store the signatures in a file and read them by memory mapping using a {@link ByteBufferLongBigList}. * If the map has a very large number of keys but the access pattern is strongly skewed towards a relatively * small number of entries, using memory mapping might be advantageous. * *

The intended usage pattern is as follows: * *

    * *
  • first, you {@linkplain #sign(Iterator, Object2LongFunction) generate a file of signatures} (note that * for this phase it might be necessary to keep the signatures in main memory; see {@link #sign(Iterator, String)} for some elaboration); * *
  • then, when you want to use the signed map you map the file using {@link ByteBufferLongBigList#map(java.nio.channels.FileChannel)} * and {@linkplain #LongBigListSignedStringMap(Object2LongFunction, LongBigList) create on the fly a signed map}. * *
* *

To simplify the process, there is a {@linkplain #LongBigListSignedStringMap(Object2LongFunction, String) constructor} that will do the mapping for you. * * @author Sebastiano Vigna * @since 2.0.13 * @deprecated There are much better and faster hash functions. */ @Deprecated public class LongBigListSignedStringMap extends AbstractObject2LongFunction implements StringMap, Serializable { private static final long serialVersionUID = 0L; /** The underlying map. */ protected final Object2LongFunction function; /** Signatures. */ protected final LongBigList signatures; /** Generates a 64-bit signatures big array using a given function and string sequence. * *

The resulting big array can be saved using {@link BinIO#storeLongs(long[][], CharSequence)} * or similar {@link BinIO} methods. * * @param iterator an iterator over a list of strings. * @param function the function to be signed. * @return a big array of 64-bit signatures. */ public static long[][] sign(final Iterator iterator, final Object2LongFunction function) { return sign(iterator, function, null); } /** Generates a 64-bit signatures big array using a given function and string sequence. * *

The resulting big array can be saved using {@link BinIO#storeLongs(long[][], CharSequence)} * or similar {@link BinIO} methods. * * @param iterator an iterator over a list of strings. * @param function the function to be signed. * @param pl a progress logger, or {@code null}. * @return a big array of 64-bit signatures. */ public static long[][] sign(final Iterator iterator, final Object2LongFunction function , final ProgressLogger pl) { final long n = function instanceof Size64 ? ((Size64)function).size64() : function .size(); final long[][] signature = LongBigArrays.newBigArray(n); if (pl != null) { pl.expectedUpdates = n; pl.start("Signing..."); } CharSequence s; for(long i = 0; i < n; i++) { s = iterator.next(); LongBigArrays.set(signature, function .getLong(s), signature(s)); if (pl != null) pl.lightUpdate(); } if (iterator.hasNext()) throw new IllegalStateException("Iterator provides more than " + n + " elements"); if (pl != null) pl.done(); return signature; } /** Creates a signed string map using a given hash map and a big list of 64-bit signatures. * * @param function a minimal perfect hash for the strings enumerated by iterator; it must support {@link Function#size() size()}. * and have default return value -1. * @param signatures a big list of 64-bit signatures generated by {@link #sign(Iterator, Object2LongFunction, ProgressLogger)}. */ public LongBigListSignedStringMap(final Object2LongFunction function, LongBigList signatures) { final long n = function instanceof Size64 ? ((Size64)function).size64() : function.size(); if (n != signatures.size64()) throw new IllegalStateException("The size of the function differs from that of the signature list: " + n + " != " + signatures.size64()); this.function = function; this.signatures = signatures; defaultReturnValue(-1); } /** Generates an on-disk list 64-bit signatures big array using a given string sequence. * *

This methods generates on-disk signatures in the same order of the strings returned by * the provided iterator. Thus, the signature file can be only used with a function that * maps each string returned by the iterator in its ordinal position. This happens, for instance, * if you have a sorted set of string and you use a monotone minimal perfect hash function. * * @param iterator an iterator over a list of strings. * @param signatureFile the file name of the resulting signature file. */ public static void sign(final Iterator iterator, final String signatureFile) throws IOException { sign(iterator, signatureFile, null); } /** Generates an on-disk list 64-bit signatures big array using a given string sequence. * * @param iterator an iterator over a list of strings. * @param signatureFile the file name of the resulting signature file. * @param pl a progress logger, or {@code null}. * @see #sign(Iterator, DataOutput, ProgressLogger) */ public static void sign(final Iterator iterator, final String signatureFile, final ProgressLogger pl) throws IOException { final DataOutputStream dos = new DataOutputStream(new FastBufferedOutputStream(new FileOutputStream(signatureFile))); sign(iterator, dos, pl); dos.close(); } /** Generates an on-disk list 64-bit signatures big array using a given string sequence. * *

This methods generates on-disk signatures in the same order of the strings returned by * the provided iterator. Thus, the generated signatures can be only used with a function that * maps each string returned by the iterator in its ordinal position. This happens, for instance, * if you have a sorted set of string and you use a * monotone minimal perfect hash function. * * @param iterator an iterator over a list of strings. * @param signatures a {@link DataOutput} where the signatures will be written. * @param pl a progress logger, or {@code null}. */ public static void sign(final Iterator iterator, final DataOutput signatures, final ProgressLogger pl) throws IOException { if (pl != null) pl.start("Signing..."); CharSequence s; while(iterator.hasNext()) { s = iterator.next(); signatures.writeLong(signature(s)); if (pl != null) pl.lightUpdate(); } if (pl != null) pl.done(); } /** Creates a signed string map using a given hash map and a big list of 64-bit signatures. * * @param function a minimal perfect hash for the strings enumerated by iterator; it must support {@link Function#size() size()}. * and have default return value -1. * @param signatures a file containing a list of 64-bit signatures * generated by {@link #sign(Iterator, Object2LongFunction, ProgressLogger)} and stored in {@link DataOutput} format. */ @SuppressWarnings("resource") public LongBigListSignedStringMap(final Object2LongFunction function, String signatures) throws FileNotFoundException, IOException { final long n = function instanceof Size64 ? 
((Size64)function).size64() : function.size(); final long signatureSize = new File(signatures).length() / (Long.SIZE / Byte.SIZE); if (n != signatureSize) throw new IllegalStateException("The size of the function differs from that of the signature list: " + n + " != " + signatureSize); this.function = function; this.signatures = ByteBufferLongBigList.map(new FileInputStream(signatures).getChannel()); defaultReturnValue(-1); } private static long signature(final CharSequence s) { int i, l = s.length(); long h = 42; for (i = l; i-- != 0;) h ^= (h << 5) + s.charAt(i) + (h >>> 2); return h; } private boolean checkSignature(final CharSequence s, final long index) { return index >= 0 && index < function.size() && signatures.getLong(index) == signature(s); } @Override public long getLong(Object o) { final CharSequence s = (CharSequence)o; final long index = function.getLong(s); return checkSignature(s, index) ? index : defRetValue; } @Override public Long get(Object o) { final CharSequence s = (CharSequence)o; final long index = function.getLong(s); return checkSignature(s, index) ? 
Long.valueOf(index) : null; } @Override public boolean containsKey(Object o) { final CharSequence s = (CharSequence)o; return checkSignature(s, function.getLong(s)); } @Override @Deprecated public int size() { return signatures.size(); } @Override public long size64() { return signatures.size64(); } @Override public ObjectBigList list() { return null; } @SuppressWarnings("unchecked") public static void main(final String[] arg) throws NoSuchMethodException, IOException, JSAPException, ClassNotFoundException { final SimpleJSAP jsap = new SimpleJSAP(LongBigListSignedStringMap.class.getName(), "Generates a 64-bit signature file by reading a newline-separated list of strings and a function built on the same list of strings.", new Parameter[] { new FlaggedOption("bufferSize", JSAP.INTSIZE_PARSER, "64Ki", JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of the I/O buffer used to read strings."), new FlaggedOption("encoding", ForNameStringParser.getParser(Charset.class), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The string file encoding."), new Switch("zipped", 'z', "zipped", "The string list is compressed in gzip format."), new UnflaggedOption("function", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename of the function to be signed, or - for writing signatures in the same order of the strings."), new UnflaggedOption("signatures", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename of the resulting signatures."), new UnflaggedOption("stringFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "Read strings from this file instead of standard input."), }); JSAPResult jsapResult = jsap.parse(arg); if (jsap.messagePrinted()) return; final int bufferSize = jsapResult.getInt("bufferSize"); final String functionName = jsapResult.getString("function"); final String signaturesName = jsapResult.getString("signatures"); final String stringFile = jsapResult.getString("stringFile"); final 
Charset encoding = (Charset)jsapResult.getObject("encoding"); final boolean zipped = jsapResult.getBoolean("zipped"); final InputStream inputStream = stringFile != null ? new FileInputStream(stringFile) : System.in; final Iterator iterator = new LineIterator(new FastBufferedReader(new InputStreamReader(zipped ? new GZIPInputStream(inputStream) : inputStream, encoding), bufferSize)); final Object2LongFunction function = "-".equals(functionName) ? null : (Object2LongFunction)BinIO.loadObject(functionName); final Logger logger = LoggerFactory.getLogger(LongBigListSignedStringMap.class); final ProgressLogger pl = new ProgressLogger(logger); if (function != null) BinIO.storeLongs(sign(iterator, function, pl), signaturesName); else sign(iterator, signaturesName, pl); if (stringFile != null) inputStream.close(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy