package it.unimi.dsi.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.zip.GZIPInputStream;

import org.apache.commons.io.IOUtils;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import com.martiansoftware.jsap.stringparsers.ForNameStringParser;

/*
 * DSI utilities
 *
 * Copyright (C) 2005-2017 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.bits.BitVector;
import it.unimi.dsi.bits.PrefixCoderTransformationStrategy;
import it.unimi.dsi.compression.Decoder;
import it.unimi.dsi.compression.HuTuckerCodec;
import it.unimi.dsi.compression.PrefixCodec;
import it.unimi.dsi.compression.PrefixCoder;
import it.unimi.dsi.fastutil.booleans.BooleanIterator;
import it.unimi.dsi.fastutil.chars.Char2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;

// TODO: implement interfaces correctly (e.g., using the iterator)

/** An immutable prefix map mostly stored in external memory.
 *
 * <p>Instances of this class write on a dump stream the strings in the domain
 * of the prefix map. More precisely, the dump stream is formed by blocks, and each
 * block (with user-definable length, possibly the size of a basic disk I/O operation)
 * is filled as much as possible with strings front coded and compressed with a
 * {@link it.unimi.dsi.compression.HuTuckerCodec}.
 * Each block starts with the length of the first string in unary, followed by the encoding of the
 * string. Then, for each string we write in unary the length of the common prefix (in characters)
 * with the previous string, the length of the remaining suffix (in characters)
 * and finally the encoded suffix. Note that if the encoding of a string is longer than a block,
 * the string will occupy more than one block.
 *
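 * <p>As an illustration only, the following sketch decodes the strings of one block following
 * the format above; here {@code in} stands for the dump stream positioned at the start of a block,
 * {@code stringsInBlock} for the number of strings stored in the block, and {@code decoder} and
 * {@code symbol2char} for the internal decoding tables of this class:
 * <pre>{@code
 * MutableString s = new MutableString();
 * int prefixLength = -1;
 * for (int k = 0; k < stringsInBlock; k++) {
 *     // The first string of a block is stored without a common-prefix length.
 *     prefixLength = prefixLength < 0 ? 0 : in.readUnary();
 *     int suffixLength = in.readUnary();
 *     s.delete(prefixLength, s.length());
 *     s.length(prefixLength + suffixLength);
 *     for (int i = 0; i < suffixLength; i++)
 *         s.charAt(prefixLength + i, symbol2char[decoder.decode(in)]);
 *     // ... use s ...
 * }
 * }</pre>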

 * <p>We keep track using an {@link ImmutableBinaryTrie}
 * of the strings at the start of each block in {@link HuTuckerCodec Hu–Tucker coding}:
 * thus, we are able to retrieve the interval corresponding
 * to a given prefix by calling {@link ImmutableBinaryTrie#getApproximatedInterval(BooleanIterator) getApproximatedInterval()}
 * and scanning at most two blocks.
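 *
 * <p>For example, a minimal usage sketch (assuming {@code terms} is a lexicographically sorted
 * collection of character sequences):
 * <pre>{@code
 * ImmutableExternalPrefixMap map = new ImmutableExternalPrefixMap(terms);
 * Interval interval = map.getInterval("foo");
 * // interval now contains the indices of the terms starting with "foo",
 * // or it is Intervals.EMPTY_INTERVAL if there is no such term.
 * }</pre>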

 * <h2>Self-contained or non-self-contained</h2>

 * <p>There are two kinds of external prefix maps: self-contained and non-self-contained.
 * In the first case, you get a serialised object that you can load at any time. The dump
 * stream is serialised with the object and expanded at each deserialisation in the Java temporary directory.
 * If you deserialise a map several times, you will get correspondingly many copies of
 * the dump stream in the temporary directory. The dump streams are deleted when the JVM
 * exits. This mechanism is not very efficient, but since this class implements several
 * interfaces it is essential that clients can make things work in a standard way.
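 *
 * <p>As a rough sketch (the filename is purely illustrative), a self-contained map can be
 * serialised and reloaded as follows; upon deserialisation the dump stream is expanded again
 * in the temporary directory, so the copy is immediately usable:
 * <pre>{@code
 * ImmutableExternalPrefixMap map = new ImmutableExternalPrefixMap(terms); // self-contained
 * BinIO.storeObject(map, "terms.map");
 * ImmutableExternalPrefixMap copy = (ImmutableExternalPrefixMap)BinIO.loadObject("terms.map");
 * }</pre>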

 * <p>Alternatively, you can give at creation time a filename for the dump stream.
 * The resulting non-self-contained external prefix map
 * can be serialised, but after deserialisation
 * you need to set the {@linkplain #setDumpStream(CharSequence) dump stream filename}
 * or even directly the {@linkplain #setDumpStream(InputBitStream) dump stream} (for instance, to
 * an {@linkplain it.unimi.dsi.io.InputBitStream#InputBitStream(byte[]) input bit stream
 * wrapping a byte array where the dump stream has been loaded}). You can deserialise many
 * copies of an external prefix map, letting all copies share the same dump stream.
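 *
 * <p>For instance, a minimal sketch (filenames are illustrative only):
 * <pre>{@code
 * // Creates a non-self-contained map whose dump stream is written to "terms.dump".
 * ImmutableExternalPrefixMap map = new ImmutableExternalPrefixMap(terms, ImmutableExternalPrefixMap.STD_BLOCK_SIZE, "terms.dump");
 * BinIO.storeObject(map, "terms.map");
 * // After deserialisation the dump stream must be set again (see setDumpStream()).
 * ImmutableExternalPrefixMap copy = (ImmutableExternalPrefixMap)BinIO.loadObject("terms.map");
 * copy.setDumpStream("terms.dump");
 * }</pre>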

 * <p>This data structure is not synchronised, and concurrent reads may cause problems
 * because of clashes in the usage of the underlying input bit stream. It would not
 * be a good idea in any case to open a new stream for each caller, as that would
 * certainly lead to disk thrashing.
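 *
 * <p>If concurrent access is needed, one possible approach, used by the {@linkplain #main(String[]) main method}
 * when the corresponding option is specified, is to wrap the map using {@code StringMaps.synchronize()}:
 * <pre>{@code
 * // Stores a synchronised wrapper instead of the map itself (the filename is illustrative).
 * BinIO.storeObject(StringMaps.synchronize(map), "terms.map");
 * }</pre>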

 * <p>The {@linkplain #main(String[]) main method} of this class
 * helps in building large external prefix maps.
 *
 * @author Sebastiano Vigna
 * @since 0.9.3
 */

public class ImmutableExternalPrefixMap extends AbstractPrefixMap implements Serializable {
	private final static boolean DEBUG = false;
	private final static boolean ASSERTS = false;
	public static final long serialVersionUID = 1L;

	/** The standard block size (in bytes). */
	public final static int STD_BLOCK_SIZE = 1024;
	/** The maximum number of entries in the cache map. */
	public static final int CACHE_MAX_SIZE = 1024;
	/** The in-memory data structure used to approximate intervals. */
	final protected ImmutableBinaryTrie<CharSequence> intervalApproximator;
	/** The block size of this map (in bits). */
	final protected long blockSize;
	/** A decoder used to read data from the dump stream. */
	final protected Decoder decoder;
	/** A map (given by an array) from symbols in the coder to characters. */
	final protected char[] symbol2char;
	/** A map from characters to symbols of the coder. */
	final protected Char2IntOpenHashMap char2symbol;
	/** The number of terms in this map. */
	final protected int size;
	/** The index of the first word in each block, plus an additional entry containing {@link #size}. */
	final protected int[] blockStart;
	/** An array parallel to {@link #blockStart} giving the offset in blocks in the dump file
	 * of the corresponding word in {@link #blockStart}. If there are no overflows, this will just
	 * be an initial segment of the natural numbers, but overflows cause jumps. */
	final protected int[] blockOffset;
	/** Whether this map is self-contained. */
	final protected boolean selfContained;
	/** The length in bytes of the dump stream, both for serialisation purposes and for minimal checks. */
	private final long dumpStreamLength;
	/** The filename of the temporary dump stream, or of the dump stream created by the constructor or by readObject(). */
	private transient String tempDumpStreamFilename;
	/** If true, the creation of the last DumpStreamIterator was not
	 * followed by a call to any get method. */
	protected transient boolean iteratorIsUsable;
	/** A reference to the dump stream. */
	protected transient InputBitStream dumpStream;
	/** A cache for the most recent queries. */
	protected transient Object2IntLinkedOpenHashMap<MutableString> cache;

	/** Creates an external prefix map with specified block size and dump stream.
	 *

	 * <p>This constructor does not assume that {@link CharSequence} instances returned by {@code terms.iterator()}
	 * will be distinct. Thus, it can be safely used with {@link FileLinesCollection}.
	 *
	 * @param terms an iterable whose iterator will enumerate in lexicographical order the terms for the map.
	 * @param blockSizeInBytes the block size (in bytes).
	 * @param dumpStreamFilename the name of the dump stream, or {@code null} for a self-contained map.
	 */
	public ImmutableExternalPrefixMap(final Iterable<? extends CharSequence> terms, final int blockSizeInBytes, final CharSequence dumpStreamFilename) throws IOException {
		this.blockSize = blockSizeInBytes * 8;
		this.selfContained = dumpStreamFilename == null;

		// First of all, we gather frequencies for all Unicode characters
		int[] frequency = new int[Character.MAX_VALUE + 1];
		int maxWordLength = 0;
		CharSequence s;
		int count = 0;

		final MutableString prevTerm = new MutableString();

		for(Iterator<? extends CharSequence> i = terms.iterator(); i.hasNext();) {
			s = i.next();
			maxWordLength = Math.max(s.length(), maxWordLength);
			for(int j = s.length(); j-- != 0;) frequency[s.charAt(j)]++;
			final int cmp = prevTerm.compareTo(s);
			if (count > 0 && cmp >= 0) throw new IllegalArgumentException("The provided term collection " + (cmp == 0 ? "contains duplicates" : "is not sorted") + " [" + prevTerm + ", " + s + "]");
			count++;
			prevTerm.replace(s);
		}

		size = count;

		// Then, we compute the number of actually used characters
		count = 0;
		for(int i = frequency.length; i-- != 0;) if (frequency[i] != 0) count++;

		/* Now we remap used characters in f, building at the same time maps from
		 * symbol to characters and from characters to symbols. */
		int[] packedFrequency = new int[count];
		symbol2char = new char[count];
		char2symbol = new Char2IntOpenHashMap(count);
		char2symbol.defaultReturnValue(-1);

		for(int i = frequency.length, k = count; i-- != 0;) {
			if (frequency[i] != 0) {
				packedFrequency[--k] = frequency[i];
				symbol2char[k] = (char)i;
				char2symbol.put((char)i, k);
			}
		}

		char2symbol.trim();

		// We now build the coder used to code the strings
		final PrefixCoder prefixCoder;
		final PrefixCodec codec;
		final BitVector[] codeWord;

		if (packedFrequency.length != 0) {
			codec = new HuTuckerCodec(packedFrequency);
			prefixCoder = codec.coder();
			decoder = codec.decoder();
			codeWord = prefixCoder.codeWords();
		}
		else {
			// This handles the case of a collection without words
			codec = null;
			prefixCoder = null;
			decoder = null;
			codeWord = null;
		}

		packedFrequency = frequency = null;

		// We now compress all strings using the given codec mixed with front coding
		final OutputBitStream output;
		if (selfContained) {
			final File temp = File.createTempFile(this.getClass().getName(), ".dump");
			temp.deleteOnExit();
			tempDumpStreamFilename = temp.toString();
			output = new OutputBitStream(temp, blockSizeInBytes);
		}
		else output = new OutputBitStream(tempDumpStreamFilename = dumpStreamFilename.toString(), blockSizeInBytes);

		// This array will contain the delimiting words (the ones at the start of each block)
		boolean isDelimiter;
		int length, prevTermLength = 0, bits;
		int prefixLength = 0, termCount = 0;
		int currBuffer = 0;

		final IntArrayList blockStarts = new IntArrayList();
		final IntArrayList blockOffsets = new IntArrayList();
		final ObjectArrayList<MutableString> delimiters = new ObjectArrayList<>();
		prevTerm.length(0);

		for(Iterator<? extends CharSequence> i = terms.iterator(); i.hasNext();) {
			s = (CharSequence) i.next();
			length = s.length();
			isDelimiter = false;

			// We compute the common prefix and the number of bits that are necessary to code the next term.
			bits = 0;
			for(prefixLength = 0; prefixLength < length && prefixLength < prevTermLength && prevTerm.charAt(prefixLength) == s.charAt(prefixLength); prefixLength++);
			for(int j = prefixLength; j < length; j++) bits += codeWord[char2symbol.get(s.charAt(j))].length();

			//if (bits + length + 1 > blockSize) throw new IllegalArgumentException("The string \"" + s + "\" is too long to be encoded with block size " + blockSizeInBytes);

			// If the next term would overflow the block, and we are not at the start of a block, we align.
			if (output.writtenBits() % blockSize != 0 &&
					output.writtenBits() / blockSize != (output.writtenBits() + (length - prefixLength + 1) + (prefixLength + 1) + bits - 1) / blockSize) {
				// We align by writing 0es.
				if (DEBUG) System.err.println("Aligning away " + (blockSize - output.writtenBits() % blockSize) + " bits...");
				for(int j = (int)(blockSize - output.writtenBits() % blockSize); j-- != 0;) output.writeBit(0);
				if (ASSERTS) assert output.writtenBits() % blockSize == 0;
			}

			if (output.writtenBits() % blockSize == 0) {
				isDelimiter = true;
				prefixLength = 0;
				blockOffsets.add((int)(output.writtenBits() / blockSize));
			}

			// Note that delimiters do not get the prefix length, as it's 0.
			if (! isDelimiter) output.writeUnary(prefixLength);
			output.writeUnary(length - prefixLength);

			// Write the next coded suffix on output.
			for(int j = prefixLength; j < length; j++) {
				BitVector c = codeWord[char2symbol.get(s.charAt(j))];
				for(long k = 0; k < c.size64(); k++) output.writeBit(c.getBoolean(k));
			}

			if (isDelimiter) {
				if (DEBUG) System.err.println("First string of block " + blockStarts.size() + ": " + termCount + " (" + s + ")");
				// The current word starts a new block
				blockStarts.add(termCount);
				// We do not want to rely on s being immutable.
				delimiters.add(new MutableString(s));
			}

			currBuffer = 1 - currBuffer;
			prevTerm.replace(s);
			prevTermLength = length;
			termCount++;
		}

		output.align();
		dumpStreamLength = output.writtenBits() / 8;
		output.close();

		intervalApproximator = prefixCoder == null ? null : new ImmutableBinaryTrie<>(delimiters, new PrefixCoderTransformationStrategy(prefixCoder, char2symbol, false));

		blockStarts.add(size);
		blockStart = blockStarts.toIntArray();
		blockOffset = blockOffsets.toIntArray();

		// We use a buffer of the same size of a block, hoping in fast I/O.
		dumpStream = new InputBitStream(tempDumpStreamFilename, blockSizeInBytes);
		(this.cache = new Object2IntLinkedOpenHashMap<>()).defaultReturnValue(-1);
	}

	/** Creates an external prefix map with block size {@link #STD_BLOCK_SIZE} and specified dump stream.
	 *

	 * <p>This constructor does not assume that {@link CharSequence} instances returned by {@code terms.iterator()}
	 * will be distinct. Thus, it can be safely used with {@link FileLinesCollection}.
	 *
	 * @param terms a collection whose iterator will enumerate in lexicographical order the terms for the map.
	 * @param dumpStreamFilename the name of the dump stream, or {@code null} for a self-contained map.
	 */
	public ImmutableExternalPrefixMap(final Iterable<? extends CharSequence> terms, final CharSequence dumpStreamFilename) throws IOException {
		this(terms, STD_BLOCK_SIZE, dumpStreamFilename);
	}

	/** Creates an external prefix map with specified block size.
	 *

	 * <p>This constructor does not assume that {@link CharSequence} instances returned by {@code terms.iterator()}
	 * will be distinct. Thus, it can be safely used with {@link FileLinesCollection}.
	 *
	 * @param blockSizeInBytes the block size (in bytes).
	 * @param terms a collection whose iterator will enumerate in lexicographical order the terms for the map.
	 */
	public ImmutableExternalPrefixMap(final Iterable<? extends CharSequence> terms, final int blockSizeInBytes) throws IOException {
		this(terms, blockSizeInBytes, null);
	}

	/** Creates an external prefix map with block size {@link #STD_BLOCK_SIZE}.
	 *

	 * <p>This constructor does not assume that {@link CharSequence} instances returned by {@code terms.iterator()}
	 * will be distinct. Thus, it can be safely used with {@link FileLinesCollection}.
	 *
	 * @param terms a collection whose iterator will enumerate in lexicographical order the terms for the map.
	 */
	public ImmutableExternalPrefixMap(final Iterable<? extends CharSequence> terms) throws IOException {
		this(terms, null);
	}

	private void safelyCloseDumpStream() {
		try {
			if (this.dumpStream != null) this.dumpStream.close();
		} catch (IOException ignore) {}
	}

	private void ensureNotSelfContained() {
		if (selfContained) throw new IllegalStateException("You cannot set the dump file of a self-contained external prefix map");
	}

	private boolean isEncodable(final CharSequence s) {
		for(int i = s.length(); i-- != 0;) if (! char2symbol.containsKey(s.charAt(i))) return false;
		return true;
	}

	/** Sets the dump stream of this external prefix map to a given filename.
	 *

	 * <p>This method sets the dump file used by this map, and should only be
	 * called after deserialisation, providing exactly the file generated at
	 * creation time. Essentially anything can happen if you do not follow the rules.
	 *
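	 * <p>For example, a minimal sketch of the intended usage (the filenames {@code "terms.map"} and
	 * {@code "terms.dump"} are purely illustrative; the dump file must be the one written at creation time):
	 * <pre>{@code
	 * ImmutableExternalPrefixMap map = (ImmutableExternalPrefixMap)BinIO.loadObject("terms.map");
	 * map.setDumpStream("terms.dump"); // required before any get method is called
	 * }</pre>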

	 * <p>Note that this method will attempt to close the old stream, if present.
	 *
	 * @param dumpStreamFilename the name of the dump file.
	 * @see #setDumpStream(InputBitStream)
	 */
	public void setDumpStream(final CharSequence dumpStreamFilename) throws FileNotFoundException {
		ensureNotSelfContained();
		safelyCloseDumpStream();
		iteratorIsUsable = false;
		final long newLength = new File(dumpStreamFilename.toString()).length();
		if (newLength != dumpStreamLength)
			throw new IllegalArgumentException("The size of the new dump file (" + newLength + ") does not match the original length (" + dumpStreamLength + ")");
		dumpStream = new InputBitStream(dumpStreamFilename.toString(), (int)(blockSize / 8));
	}

	/** Sets the dump stream of this external prefix map to a given input bit stream.
	 *

	 * <p>This method sets the dump file used by this map, and should only be
	 * called after deserialisation, providing a repositionable stream containing
	 * exactly the file generated at creation time. Essentially anything can happen
	 * if you do not follow the rules.
	 *

	 * <p>Using this method you can load an external prefix map in core memory, enjoying
	 * the compactness of the data structure, but getting much more speed.
	 *
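	 * <p>For instance, a rough sketch (filenames are illustrative; the whole dump file is read into
	 * a byte array and wrapped in an {@link InputBitStream}):
	 * <pre>{@code
	 * ImmutableExternalPrefixMap map = (ImmutableExternalPrefixMap)BinIO.loadObject("terms.map");
	 * FileInputStream fis = new FileInputStream("terms.dump");
	 * byte[] dump = IOUtils.toByteArray(fis);
	 * fis.close();
	 * map.setDumpStream(new InputBitStream(dump));
	 * }</pre>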

	 * <p>Note that this method will attempt to close the old stream, if present.
	 *
	 * @param dumpStream a repositionable input bit stream containing exactly the dump stream generated
	 * at creation time.
	 * @see #setDumpStream(CharSequence)
	 */
	public void setDumpStream(final InputBitStream dumpStream) {
		ensureNotSelfContained();
		safelyCloseDumpStream();
		iteratorIsUsable = false;
		this.dumpStream = dumpStream;
	}

	private void ensureStream() {
		if (dumpStream == null) throw new IllegalStateException("This external prefix map has been deserialised, but no dump stream has been set");
	}

	@Override
	public Interval getInterval(final CharSequence prefix) {
		ensureStream();
		// If prefix contains any character not coded by the prefix coder, we can return the empty interval.
		if (! isEncodable(prefix)) return Intervals.EMPTY_INTERVAL;
		// We recover the left extremes of the intervals where extensions of prefix could possibly lie.
		Interval interval = intervalApproximator.getApproximatedInterval(prefix);
		// System.err.println("Approximate interval: " + interval + " , terms: [" + blockStart[interval.left] + ", " + blockStart[interval.right] + "]");

		if (interval == Intervals.EMPTY_INTERVAL) return interval;

		try {
			dumpStream.position(blockOffset[interval.left] * blockSize);
			dumpStream.readBits(0);
			iteratorIsUsable = false;
			MutableString s = new MutableString();

			int suffixLength, prefixLength = -1, count = blockStart[interval.left], blockEnd = blockStart[interval.left + 1], start = -1, end = -1;

			/* We scan the dump file, stopping if we exhaust the block */
			while(count < blockEnd) {
				if (prefixLength < 0) prefixLength = 0;
				else prefixLength = dumpStream.readUnary();
				suffixLength = dumpStream.readUnary();
				s.delete(prefixLength, s.length());
				s.length(prefixLength + suffixLength);
				for(int i = 0; i < suffixLength; i++) s.charAt(i + prefixLength, symbol2char[decoder.decode(dumpStream)]);
				if (s.startsWith(prefix)) {
					start = count;
					break;
				}
				count++;
			}

			/* If we did not find our string, there are two possibilities: if the
			 * interval contains one point, there is no string extending prefix. But
			 * if the interval is larger, the first string of the second block in the
			 * interval must be an extension of prefix. */
			if (start < 0 && interval.length() == 1) return Intervals.EMPTY_INTERVAL;
			else start = count;
			end = start + 1;
			//assert dumpStream.readBits() <= blockSize;

			/* If the interval contains more than one point, the last string with
			 * given prefix is necessarily contained in the last block, and we
			 * must restart the search process. */
			if (interval.length() > 1) {
				dumpStream.position(blockOffset[interval.right] * blockSize);
				dumpStream.readBits(0);
				s.length(0);
				end = blockStart[interval.right];
				blockEnd = blockStart[interval.right + 1];
				prefixLength = -1;
			}

			while(end < blockEnd) {
				if (prefixLength < 0) prefixLength = 0;
				else prefixLength = dumpStream.readUnary();
				suffixLength = dumpStream.readUnary();
				s.delete(prefixLength, s.length());
				s.length(prefixLength + suffixLength);
				for(int i = 0; i < suffixLength; i++) s.charAt(i + prefixLength, symbol2char[decoder.decode(dumpStream)]);
				if (! s.startsWith(prefix)) break;
				end++;
			}

			return Interval.valueOf(start, end - 1);
		} catch (IOException rethrow) {
			throw new RuntimeException(rethrow);
		}
	}

	@Override
	protected MutableString getTerm(final int index, final MutableString s) {
		ensureStream();
		// We perform a binary search to find the block to which s could possibly belong.
		int block = Arrays.binarySearch(blockStart, index);
		if (block < 0) block = - block - 2;

		try {
			dumpStream.position(blockOffset[block] * blockSize);
			dumpStream.readBits(0);
			iteratorIsUsable = false;
			int suffixLength, prefixLength = -1;
			for(int i = index - blockStart[block] + 1; i-- != 0;) {
				if (prefixLength < 0) prefixLength = 0;
				else prefixLength = dumpStream.readUnary();
				suffixLength = dumpStream.readUnary();
				s.delete(prefixLength, s.length());
				s.length(prefixLength + suffixLength);
				for(int j = 0; j < suffixLength; j++) s.charAt(j + prefixLength, symbol2char[decoder.decode(dumpStream)]);
			}
			return s;
		} catch(IOException rethrow) {
			throw new RuntimeException(rethrow);
		}
	}

	private long getIndex(final Object o) {
		final CharSequence term = (CharSequence)o;
		final int cached = cache.getAndMoveToFirst(new MutableString(term));
		if (cached != -1) return cached;
		ensureStream();
		// If term contains any character not coded by the prefix coder, we can return -1
		if (! isEncodable(term)) return -1;
		/* If term is in the map, any string extending term must follow term. Thus,
		 * term can be in the map only if it can be found in the left block
		 * of an approximated interval for itself. */
		Interval interval = intervalApproximator.getApproximatedInterval(term);
		if (interval == Intervals.EMPTY_INTERVAL) return -1;

		try {
			dumpStream.position(blockOffset[interval.left] * blockSize);
			dumpStream.readBits(0);
			iteratorIsUsable = false;
			MutableString s = new MutableString();
			int suffixLength, prefixLength = -1, count = blockStart[interval.left], blockEnd = blockStart[interval.left + 1];

			/* We scan the dump file, stopping if we exhaust the block */
			while(count < blockEnd) {
				if (prefixLength < 0) prefixLength = 0;
				else prefixLength = dumpStream.readUnary();
				suffixLength = dumpStream.readUnary();
				s.delete(prefixLength, s.length());
				s.length(prefixLength + suffixLength);
				for(int i = 0; i < suffixLength; i++) s.charAt(i + prefixLength, symbol2char[decoder.decode(dumpStream)]);
				if (s.equals(term)) {
					if (cache.size() >= CACHE_MAX_SIZE) cache.removeFirstInt();
					cache.put(s.copy(), count);
					return count;
				}
				count++;
			}
			return -1;
		} catch (IOException rethrow) {
			throw new RuntimeException(rethrow);
		}
	}

	@Override
	public boolean containsKey(final Object term) {
		return getIndex(term) != -1;
	}

	@Override
	public long getLong(final Object o) {
		final long result = getIndex(o);
		return result == -1 ? defRetValue : result;
	}

	/** An iterator over the dump stream. It does not use the interval approximator; it just scans the file. */
	private final class DumpStreamIterator implements ObjectIterator<CharSequence> {
		/** The current block being enumerated. */
		private int currBlock = -1;
		/** The index of the next term that will be returned. */
		private int index;
		/** The mutable string used to return the result. */
		final MutableString s = new MutableString();

		private DumpStreamIterator() {
			try {
				dumpStream.position(0);
			} catch (IOException e) {
				throw new RuntimeException(e);
			}
			dumpStream.readBits(0);
			iteratorIsUsable = true;
		}

		@Override
		public boolean hasNext() {
			if (! iteratorIsUsable) throw new IllegalStateException("Get methods of this map have caused a stream repositioning");
			return index < size;
		}

		@Override
		public CharSequence next() {
			if (! hasNext()) throw new NoSuchElementException();
			try {
				final int prefixLength;
				if (index == blockStart[currBlock + 1]) {
					if (dumpStream.readBits() % blockSize != 0) dumpStream.skip(blockSize - dumpStream.readBits() % blockSize);
					currBlock++;
					prefixLength = 0;
				}
				else prefixLength = dumpStream.readUnary();
				final int suffixLength = dumpStream.readUnary();
				s.delete(prefixLength, s.length());
				s.length(prefixLength + suffixLength);
				for (int i = 0; i < suffixLength; i++) s.charAt(i + prefixLength, symbol2char[decoder.decode(dumpStream)]);
				index++;
				return s;
			} catch (IOException e) {
				throw new RuntimeException(e);
			}
		}
	}

	/** Returns an iterator over the map.
	 *

	 * <p>The iterator returned by this method scans the dump stream directly.
	 *
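	 * <p>As a rough usage sketch (assuming {@code map} is an instance of this class whose dump
	 * stream is available):
	 * <pre>{@code
	 * ObjectIterator<CharSequence> i = map.iterator();
	 * while (i.hasNext()) {
	 *     CharSequence term = i.next();
	 *     // ... process term; the returned CharSequence is reused across calls ...
	 * }
	 * }</pre>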

	 * <p>Note that the returned iterator uses the same stream as all get methods. Calling such methods while
	 * the iterator is being used will produce an {@link IllegalStateException}.
	 *
	 * @return an iterator over the map that just scans the dump stream.
	 */
	public ObjectIterator<CharSequence> iterator() {
		return new DumpStreamIterator();
	}

	@Override
	public int size() {
		return size;
	}

	private void writeObject(final ObjectOutputStream s) throws IOException {
		s.defaultWriteObject();
		if (selfContained) {
			final FileInputStream fis = new FileInputStream(tempDumpStreamFilename);
			IOUtils.copy(fis, s);
			fis.close();
		}
	}

	private void readObject(final ObjectInputStream s) throws IOException, ClassNotFoundException {
		s.defaultReadObject();
		if (selfContained) {
			final File temp = File.createTempFile(this.getClass().getName(), ".dump");
			temp.deleteOnExit();
			tempDumpStreamFilename = temp.toString();
			// TODO: propose Jakarta CopyUtils extension with length control and refactor.
			FileOutputStream fos = new FileOutputStream(temp);
			final byte[] b = new byte[64 * 1024];
			int len;
			while((len = s.read(b)) >= 0) fos.write(b, 0, len);
			fos.close();
			dumpStream = new InputBitStream(temp, (int)(blockSize / 8));
		}
		(this.cache = new Object2IntLinkedOpenHashMap<>()).defaultReturnValue(-1);
	}

	@SuppressWarnings("unchecked")
	public static void main(final String[] arg) throws ClassNotFoundException, IOException, JSAPException, SecurityException, NoSuchMethodException {
		final SimpleJSAP jsap = new SimpleJSAP(ImmutableExternalPrefixMap.class.getName(),
				"Builds an external prefix map reading from standard input a newline-separated list of sorted terms or a serialised term list. If the dump stream name is not specified, the map will be self-contained.\n\n" +
				"Note that if you read terms from stdin or from a serialized object all terms will have to be loaded in memory.",
				new Parameter[] {
					new FlaggedOption("blockSize", JSAP.INTSIZE_PARSER, (STD_BLOCK_SIZE / 1024) + "Ki", JSAP.NOT_REQUIRED, 'b', "block-size", "The size of a block in the dump stream."),
					new Switch("serialised", 's', "serialised", "The data source (file or standard input) provides a serialised java.util.List of terms."),
					new Switch("synchronised", 'S', "synchronised", "The serialised map will be synchronised."),
					new Switch("zipped", 'z', "zipped", "Standard input is compressed in gzip format."),
					new FlaggedOption("termFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "offline", "Read terms from this file instead of standard input."),
					new FlaggedOption("encoding", ForNameStringParser.getParser(Charset.class), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The term list encoding."),
					new UnflaggedOption("map", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised map."),
					new UnflaggedOption("dump", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "An optional dump stream (the resulting map will not be self-contained).")
				});

		JSAPResult jsapResult = jsap.parse(arg);
		if (jsap.messagePrinted()) return;

		Collection<? extends CharSequence> termList;

		final String termFile = jsapResult.getString("termFile");
		final Charset encoding = (Charset)jsapResult.getObject("encoding");
		final boolean zipped = jsapResult.getBoolean("zipped");
		final boolean serialised = jsapResult.getBoolean("serialised");
		final boolean synchronised = jsapResult.getBoolean("synchronised");
		if (zipped && serialised) throw new IllegalArgumentException("The zipped and serialised options are incompatible");

		if (serialised) termList = (List<? extends CharSequence>)(termFile != null ? BinIO.loadObject(termFile) : BinIO.loadObject(System.in));
		else {
			if (termFile != null) termList = new FileLinesCollection(termFile, encoding.name(), zipped);
			else {
				final ObjectArrayList<MutableString> list = new ObjectArrayList<>();
				termList = list;
				final FastBufferedReader terms = new FastBufferedReader(new InputStreamReader(zipped ? new GZIPInputStream(System.in) : System.in, encoding.name()));
				final MutableString term = new MutableString();
				while(terms.readLine(term) != null) list.add(term.copy());
				terms.close();
			}
		}

		final ImmutableExternalPrefixMap immutableExternalPrefixMap = new ImmutableExternalPrefixMap(termList, jsapResult.getInt("blockSize"), jsapResult.getString("dump"));
		BinIO.storeObject(synchronised ? StringMaps.synchronize(immutableExternalPrefixMap) : immutableExternalPrefixMap, jsapResult.getString("map"));
	}
}




