package it.unimi.dsi.util;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import com.martiansoftware.jsap.stringparsers.ForNameStringParser;
/*
* DSI utilities
*
* Copyright (C) 2005-2017 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see &lt;http://www.gnu.org/licenses/&gt;.
*
*/
import it.unimi.dsi.bits.BitVector;
import it.unimi.dsi.bits.PrefixCoderTransformationStrategy;
import it.unimi.dsi.compression.Decoder;
import it.unimi.dsi.compression.HuTuckerCodec;
import it.unimi.dsi.compression.PrefixCodec;
import it.unimi.dsi.compression.PrefixCoder;
import it.unimi.dsi.fastutil.booleans.BooleanIterator;
import it.unimi.dsi.fastutil.chars.Char2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
// TODO: implement interfaces correctly (e.g., using the iterator)
/** An immutable prefix map mostly stored in external memory.
*
* <p>Instances of this class write the strings in the domain of the prefix map to a dump
* stream. More precisely, the dump stream is formed by blocks, and each
* block (with user-definable length, possibly the size of a basic disk I/O operation)
* is filled as much as possible with strings front coded and compressed with a
* {@link it.unimi.dsi.compression.HuTuckerCodec}.
* Each block starts with the length of the first string in unary, followed by the encoding of the
* string. Then, for each subsequent string we write in unary the length of the common prefix (in characters)
* with the previous string, the length of the remaining suffix (in characters)
* and finally the encoded suffix. Note that if the encoding of a string is longer than a block,
* the string will occupy more than one block.
*
*
* <p>We keep track, using an {@link ImmutableBinaryTrie}, of the strings at the start of each
* block in {@link HuTuckerCodec Hu–Tucker coding}:
* thus, we are able to retrieve the interval corresponding
* to a given prefix by calling {@link ImmutableBinaryTrie#getApproximatedInterval(BooleanIterator) getApproximatedInterval()}
* and scanning at most two blocks.
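*
* <p>For instance, a small self-contained map might be built and queried roughly as follows
* (a minimal sketch; the term list must be lexicographically sorted):
*
* <pre>{@code
* ImmutableExternalPrefixMap map =
*     new ImmutableExternalPrefixMap(Arrays.asList("a", "ab", "b"));
* map.getLong("ab");    // 1
* map.getInterval("a"); // the interval [0..1], that is, "a" and "ab"
* }</pre>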
*
* <h2>Self-contained or non-self-contained</h2>
*
* There are two kinds of external prefix maps: self-contained and non-self-contained.
* In the first case, you get a serialised object that you can load at any time. The dump
* stream is serialised with the object and expanded at each deserialisation in the Java temporary directory.
* If you deserialise a map several times, you will get correspondingly many copies of
* the dump stream in the temporary directory. The dump streams are deleted when the JVM
* exits. This mechanism is not very efficient, but since this class implements several
* interfaces it is essential that clients can make things work in a standard way.
*
* <p>Alternatively, you can provide a filename for the dump stream at creation time.
* The resulting non-self-contained external prefix map
* can be serialised, but after deserialisation
* you need to set the {@linkplain #setDumpStream(CharSequence) dump stream filename}
* or even directly the {@linkplain #setDumpStream(InputBitStream) dump stream} (for instance, to
* an {@linkplain it.unimi.dsi.io.InputBitStream#InputBitStream(byte[]) input bit stream
* wrapping a byte array where the dump stream has been loaded}). You can deserialise many
* copies of an external prefix map, letting all copies share the same dump stream.
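*
* <p>For instance, assuming {@code terms} is a lexicographically sorted list of terms, a
* non-self-contained map might be created and serialised roughly as follows (a sketch;
* file names and the block size are illustrative):
*
* <pre>{@code
* ImmutableExternalPrefixMap map = new ImmutableExternalPrefixMap(terms, 4096, "terms.dump");
* BinIO.storeObject(map, "terms.map");
* }</pre>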
*
* <p>This data structure is not synchronised, and concurrent reads may cause problems
* because of clashes in the usage of the underlying input bit stream. It would not
* be a good idea in any case to open a new stream for each caller, as that would
* certainly lead to disk thrashing.
*
* <p>The {@linkplain #main(String[]) main method} of this class
* helps in building large external prefix maps.
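*
* <p>For instance, a large map might be built from a sorted term file roughly as follows
* (a sketch; file names and the block size are illustrative):
*
* <pre>{@code
* java it.unimi.dsi.util.ImmutableExternalPrefixMap -b 4Ki terms.map terms.dump < sorted-terms.txt
* }</pre>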
*
* @author Sebastiano Vigna
* @since 0.9.3
*/
public class ImmutableExternalPrefixMap extends AbstractPrefixMap implements Serializable {
private final static boolean DEBUG = false;
private final static boolean ASSERTS = false;
public static final long serialVersionUID = 1L;
/** The standard block size (in bytes). */
public final static int STD_BLOCK_SIZE = 1024;
/** The maximum number of entries in the cache map. */
public static final int CACHE_MAX_SIZE = 1024;
/** The in-memory data structure used to approximate intervals. */
final protected ImmutableBinaryTrie<CharSequence> intervalApproximator;
/** The block size of this map (in bits). */
final protected long blockSize;
/** A decoder used to read data from the dump stream. */
final protected Decoder decoder;
/** A map (given by an array) from symbols in the coder to characters. */
final protected char[] symbol2char;
/** A map from characters to symbols of the coder. */
final protected Char2IntOpenHashMap char2symbol;
/** The number of terms in this map. */
final protected int size;
/** The index of the first word in each block, plus an additional entry containing {@link #size}. */
final protected int[] blockStart;
/** An array parallel to {@link #blockStart} giving the offset in blocks in the dump file
* of the corresponding word in {@link #blockStart}. If there are no overflows, this will just
* be an initial segment of the natural numbers, but overflows cause jumps. */
final protected int[] blockOffset;
/** Whether this map is self-contained. */
final protected boolean selfContained;
/** The length in bytes of the dump stream, both for serialisation purposes and for minimal checks. */
private final long dumpStreamLength;
/** The filename of the temporary dump stream, or of the dump stream created by the constructor or by readObject(). */
private transient String tempDumpStreamFilename;
/** If true, the creation of the last {@link DumpStreamIterator} was not followed by a call to any get method. */
protected transient boolean iteratorIsUsable;
/** A reference to the dump stream. */
protected transient InputBitStream dumpStream;
/** A cache for the most recent queries. */
protected transient Object2IntLinkedOpenHashMap<MutableString> cache;
/** Creates an external prefix map with specified block size and dump stream.
*
* <p>This constructor does not assume that {@link CharSequence} instances returned by terms.iterator()
* will be distinct. Thus, it can be safely used with {@link FileLinesCollection}.
*
* @param terms an iterable whose iterator will enumerate in lexicographical order the terms for the map.
* @param blockSizeInBytes the block size (in bytes).
* @param dumpStreamFilename the name of the dump stream, or {@code null} for a self-contained map.
*/
public ImmutableExternalPrefixMap(final Iterable<? extends CharSequence> terms, final int blockSizeInBytes, final CharSequence dumpStreamFilename) throws IOException {
this.blockSize = blockSizeInBytes * 8;
this.selfContained = dumpStreamFilename == null;
// First of all, we gather frequencies for all Unicode characters
int[] frequency = new int[Character.MAX_VALUE + 1];
int maxWordLength = 0;
CharSequence s;
int count = 0;
final MutableString prevTerm = new MutableString();
for(Iterator<? extends CharSequence> i = terms.iterator(); i.hasNext();) {
s = i.next();
maxWordLength = Math.max(s.length(), maxWordLength);
for(int j = s.length(); j-- != 0;) frequency[s.charAt(j)]++;
final int cmp = prevTerm.compareTo(s);
if (count > 0 && cmp >= 0) throw new IllegalArgumentException("The provided term collection " + (cmp == 0 ? "contains duplicates" : "is not sorted") + " [" + prevTerm + ", " + s + "]");
count++;
prevTerm.replace(s);
}
size = count;
// Then, we compute the number of actually used characters
count = 0;
for(int i = frequency.length; i-- != 0;) if (frequency[i] != 0) count++;
/* Now we remap the used characters, building at the same time maps from
* symbols to characters and from characters to symbols. */
int[] packedFrequency = new int[count];
symbol2char = new char[count];
char2symbol = new Char2IntOpenHashMap(count);
char2symbol.defaultReturnValue(-1);
for(int i = frequency.length, k = count; i-- != 0;) {
if (frequency[i] != 0) {
packedFrequency[--k] = frequency[i];
symbol2char[k] = (char)i;
char2symbol.put((char)i, k);
}
}
char2symbol.trim();
// We now build the coder used to code the strings
final PrefixCoder prefixCoder;
final PrefixCodec codec;
final BitVector[] codeWord;
if (packedFrequency.length != 0) {
codec = new HuTuckerCodec(packedFrequency);
prefixCoder = codec.coder();
decoder = codec.decoder();
codeWord = prefixCoder.codeWords();
}
else {
// This handles the case of a collection without words
codec = null;
prefixCoder = null;
decoder = null;
codeWord = null;
}
packedFrequency = frequency = null;
// We now compress all strings using the given codec mixed with front coding
final OutputBitStream output;
if (selfContained) {
final File temp = File.createTempFile(this.getClass().getName(), ".dump");
temp.deleteOnExit();
tempDumpStreamFilename = temp.toString();
output = new OutputBitStream(temp, blockSizeInBytes);
}
else output = new OutputBitStream(tempDumpStreamFilename = dumpStreamFilename.toString(), blockSizeInBytes);
// This list will contain the delimiting words (the ones at the start of each block)
boolean isDelimiter;
int length, prevTermLength = 0, bits;
int prefixLength = 0, termCount = 0;
int currBuffer = 0;
final IntArrayList blockStarts = new IntArrayList();
final IntArrayList blockOffsets = new IntArrayList();
final ObjectArrayList<MutableString> delimiters = new ObjectArrayList<>();
prevTerm.length(0);
for(Iterator<?> i = terms.iterator(); i.hasNext();) {
s = (CharSequence) i.next();
length = s.length();
isDelimiter = false;
// We compute the common prefix and the number of bits that are necessary to code the next term.
bits = 0;
for(prefixLength = 0; prefixLength < length && prefixLength < prevTermLength && prevTerm.charAt(prefixLength) == s.charAt(prefixLength); prefixLength++);
for(int j = prefixLength; j < length; j++) bits += codeWord[char2symbol.get(s.charAt(j))].length();
//if (bits + length + 1 > blockSize) throw new IllegalArgumentException("The string \"" + s + "\" is too long to be encoded with block size " + blockSizeInBytes);
// If the next term would overflow the block, and we are not at the start of a block, we align.
if (output.writtenBits() % blockSize != 0 && output.writtenBits() / blockSize != (output.writtenBits() + (length - prefixLength + 1) + (prefixLength + 1) + bits - 1) / blockSize) {
// We align by writing zeroes.
if (DEBUG) System.err.println("Aligning away " + (blockSize - output.writtenBits() % blockSize) + " bits...");
for(int j = (int)(blockSize - output.writtenBits() % blockSize); j-- != 0;) output.writeBit(0);
if (ASSERTS) assert output.writtenBits() % blockSize == 0;
}
if (output.writtenBits() % blockSize == 0) {
isDelimiter = true;
prefixLength = 0;
blockOffsets.add((int)(output.writtenBits() / blockSize));
}
// Note that delimiters do not get the prefix length, as it's 0.
if (! isDelimiter) output.writeUnary(prefixLength);
output.writeUnary(length - prefixLength);
// Write the next coded suffix on output.
for(int j = prefixLength; j < length; j++) {
BitVector c = codeWord[char2symbol.get(s.charAt(j))];
for(long k = 0; k < c.size64(); k++) output.writeBit(c.getBoolean(k));
}
if (isDelimiter) {
if (DEBUG) System.err.println("First string of block " + blockStarts.size() + ": " + termCount + " (" + s + ")");
// The current word starts a new block
blockStarts.add(termCount);
// We do not want to rely on s being immutable.
delimiters.add(new MutableString(s));
}
currBuffer = 1 - currBuffer;
prevTerm.replace(s);
prevTermLength = length;
termCount++;
}
output.align();
dumpStreamLength = output.writtenBits() / 8;
output.close();
intervalApproximator = prefixCoder == null ? null : new ImmutableBinaryTrie<>(delimiters, new PrefixCoderTransformationStrategy(prefixCoder, char2symbol, false));
blockStarts.add(size);
blockStart = blockStarts.toIntArray();
blockOffset = blockOffsets.toIntArray();
// We use a buffer of the same size as a block, hoping for fast I/O.
dumpStream = new InputBitStream(tempDumpStreamFilename, blockSizeInBytes);
(this.cache = new Object2IntLinkedOpenHashMap<>()).defaultReturnValue(-1);
}
/** Creates an external prefix map with block size {@link #STD_BLOCK_SIZE} and specified dump stream.
*
* <p>This constructor does not assume that {@link CharSequence} instances returned by terms.iterator()
* will be distinct. Thus, it can be safely used with {@link FileLinesCollection}.
*
* @param terms an iterable whose iterator will enumerate in lexicographical order the terms for the map.
* @param dumpStreamFilename the name of the dump stream, or {@code null} for a self-contained map.
*/
public ImmutableExternalPrefixMap(final Iterable<? extends CharSequence> terms, final CharSequence dumpStreamFilename) throws IOException {
this(terms, STD_BLOCK_SIZE, dumpStreamFilename);
}
/** Creates an external prefix map with specified block size.
*
* <p>This constructor does not assume that {@link CharSequence} instances returned by terms.iterator()
* will be distinct. Thus, it can be safely used with {@link FileLinesCollection}.
*
* @param terms an iterable whose iterator will enumerate in lexicographical order the terms for the map.
* @param blockSizeInBytes the block size (in bytes).
*/
public ImmutableExternalPrefixMap(final Iterable<? extends CharSequence> terms, final int blockSizeInBytes) throws IOException {
this(terms, blockSizeInBytes, null);
}
/** Creates an external prefix map with block size {@link #STD_BLOCK_SIZE}.
*
* <p>This constructor does not assume that {@link CharSequence} instances returned by terms.iterator()
* will be distinct. Thus, it can be safely used with {@link FileLinesCollection}.
*
* @param terms an iterable whose iterator will enumerate in lexicographical order the terms for the map.
*/
public ImmutableExternalPrefixMap(final Iterable<? extends CharSequence> terms) throws IOException {
this(terms, null);
}
private void safelyCloseDumpStream() {
try {
if (this.dumpStream != null) this.dumpStream.close();
}
catch (IOException ignore) {}
}
private void ensureNotSelfContained() {
if (selfContained) throw new IllegalStateException("You cannot set the dump file of a self-contained external prefix map");
}
private boolean isEncodable(final CharSequence s) {
for(int i = s.length(); i-- != 0;) if (! char2symbol.containsKey(s.charAt(i))) return false;
return true;
}
/** Sets the dump stream of this external prefix map to a given filename.
*
* <p>This method sets the dump file used by this map, and should only be
* called after deserialisation, providing exactly the file generated at
* creation time. Essentially anything can happen if you do not follow the rules.
*
* <p>Note that this method will attempt to close the old stream, if present.
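*
* <p>For instance, after deserialisation the dump file might be re-attached roughly as follows
* (a sketch; file names are illustrative):
*
* <pre>{@code
* ImmutableExternalPrefixMap map = (ImmutableExternalPrefixMap) BinIO.loadObject("terms.map");
* map.setDumpStream("terms.dump"); // the very dump file provided at construction time
* }</pre>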
*
* @param dumpStreamFilename the name of the dump file.
* @see #setDumpStream(InputBitStream)
*/
public void setDumpStream(final CharSequence dumpStreamFilename) throws FileNotFoundException{
ensureNotSelfContained();
safelyCloseDumpStream();
iteratorIsUsable = false;
final long newLength = new File(dumpStreamFilename.toString()).length();
if (newLength != dumpStreamLength)
throw new IllegalArgumentException("The size of the new dump file (" + newLength + ") does not match the original length (" + dumpStreamLength + ")");
dumpStream = new InputBitStream(dumpStreamFilename.toString(), (int)(blockSize / 8));
}
/** Sets the dump stream of this external prefix map to a given input bit stream.
*
* <p>This method sets the dump file used by this map, and should only be
* called after deserialisation, providing a repositionable stream containing
* exactly the file generated at
* creation time. Essentially anything can happen if you do not follow the rules.
*
* <p>Using this method you can load an external prefix map in core memory, enjoying
* the compactness of the data structure, but getting much more speed.
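*
* <p>For instance, the dump stream might be loaded into memory roughly as follows (a sketch;
* file names are illustrative):
*
* <pre>{@code
* ImmutableExternalPrefixMap map = (ImmutableExternalPrefixMap) BinIO.loadObject("terms.map");
* map.setDumpStream(new InputBitStream(BinIO.loadBytes("terms.dump")));
* }</pre>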
*
* <p>Note that this method will attempt to close the old stream, if present.
*
* @param dumpStream a repositionable input bit stream containing exactly the dump stream generated
* at creation time.
* @see #setDumpStream(CharSequence)
*/
public void setDumpStream(final InputBitStream dumpStream) {
ensureNotSelfContained();
safelyCloseDumpStream();
iteratorIsUsable = false;
this.dumpStream = dumpStream;
}
private void ensureStream() {
if (dumpStream == null) throw new IllegalStateException("This external prefix map has been deserialised, but no dump stream has been set");
}
@Override
public Interval getInterval(final CharSequence prefix) {
ensureStream();
// If prefix contains any character not coded by the prefix coder, we can return the empty interval.
if (! isEncodable(prefix)) return Intervals.EMPTY_INTERVAL;
// We recover the left extremes of the intervals where extensions of prefix could possibly lie.
Interval interval = intervalApproximator.getApproximatedInterval(prefix);
// System.err.println("Approximate interval: " + interval + " , terms: [" + blockStart[interval.left] + ", " + blockStart[interval.right] + "]");
if (interval == Intervals.EMPTY_INTERVAL) return interval;
try {
dumpStream.position(blockOffset[interval.left] * blockSize);
dumpStream.readBits(0);
iteratorIsUsable = false;
MutableString s = new MutableString();
int suffixLength, prefixLength = -1, count = blockStart[interval.left], blockEnd = blockStart[interval.left + 1], start = -1, end = -1;
/* We scan the dump file, stopping if we exhaust the block */
while(count < blockEnd) {
if (prefixLength < 0) prefixLength = 0;
else prefixLength = dumpStream.readUnary();
suffixLength = dumpStream.readUnary();
s.delete(prefixLength, s.length());
s.length(prefixLength + suffixLength);
for(int i = 0; i < suffixLength; i++) s.charAt(i + prefixLength, symbol2char[decoder.decode(dumpStream)]);
if (s.startsWith(prefix)) {
start = count;
break;
}
count++;
}
/* If we did not find our string, there are two possibilities: if the
* interval contains one point, there is no string extending prefix. But
* if the interval is larger, the first string of the second block in the
* interval must be an extension of prefix. */
if (start < 0 && interval.length() == 1) return Intervals.EMPTY_INTERVAL;
else start = count;
end = start + 1;
//assert dumpStream.readBits() <= blockSize;
/* If the interval contains more than one point, the last string with
* given prefix is necessarily contained in the last block, and we
* must restart the search process. */
if (interval.length() > 1) {
dumpStream.position(blockOffset[interval.right] * blockSize);
dumpStream.readBits(0);
s.length(0);
end = blockStart[interval.right];
blockEnd = blockStart[interval.right + 1];
prefixLength = -1;
}
while(end < blockEnd) {
if (prefixLength < 0) prefixLength = 0;
else prefixLength = dumpStream.readUnary();
suffixLength = dumpStream.readUnary();
s.delete(prefixLength, s.length());
s.length(prefixLength + suffixLength);
for(int i = 0; i < suffixLength; i++) s.charAt(i + prefixLength, symbol2char[decoder.decode(dumpStream)]);
if (! s.startsWith(prefix)) break;
end++;
}
return Interval.valueOf(start, end - 1);
} catch (IOException rethrow) {
throw new RuntimeException(rethrow);
}
}
@Override
protected MutableString getTerm(final int index, final MutableString s) {
ensureStream();
// We perform a binary search to find the block to which s could possibly belong.
int block = Arrays.binarySearch(blockStart, index);
if (block < 0) block = - block - 2;
try {
dumpStream.position(blockOffset[block] * blockSize);
dumpStream.readBits(0);
iteratorIsUsable = false;
int suffixLength, prefixLength = -1;
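// Decode terms sequentially from the start of the block until we reach the one at the given index.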
for(int i = index - blockStart[block] + 1; i-- != 0;) {
if (prefixLength < 0) prefixLength = 0;
else prefixLength = dumpStream.readUnary();
suffixLength = dumpStream.readUnary();
s.delete(prefixLength, s.length());
s.length(prefixLength + suffixLength);
for(int j = 0; j < suffixLength; j++) s.charAt(j + prefixLength, symbol2char[decoder.decode(dumpStream)]);
}
return s;
}
catch(IOException rethrow) {
throw new RuntimeException(rethrow);
}
}
private long getIndex(final Object o) {
final CharSequence term = (CharSequence)o;
final int cached = cache.getAndMoveToFirst(new MutableString(term));
if (cached != -1) return cached;
ensureStream();
// If term contains any character not coded by the prefix coder, we can return -1
if (! isEncodable(term)) return -1;
/* If term is in the map, any string extending term must follow term. Thus,
* term can be in the map only if it can be found in the left block
* of an approximated interval for itself. */
Interval interval = intervalApproximator.getApproximatedInterval(term);
if (interval == Intervals.EMPTY_INTERVAL) return -1;
try {
dumpStream.position(blockOffset[interval.left] * blockSize);
dumpStream.readBits(0);
iteratorIsUsable = false;
MutableString s = new MutableString();
int suffixLength, prefixLength = -1, count = blockStart[interval.left], blockEnd = blockStart[interval.left + 1];
/* We scan the dump file, stopping if we exhaust the block */
while(count < blockEnd) {
if (prefixLength < 0) prefixLength = 0;
else prefixLength = dumpStream.readUnary();
suffixLength = dumpStream.readUnary();
s.delete(prefixLength, s.length());
s.length(prefixLength + suffixLength);
for(int i = 0; i < suffixLength; i++) s.charAt(i + prefixLength, symbol2char[decoder.decode(dumpStream)]);
if (s.equals(term)) {
if (cache.size() >= CACHE_MAX_SIZE) cache.removeFirstInt();
cache.put(s.copy(), count);
return count;
}
count++;
}
return -1;
}
catch (IOException rethrow) {
throw new RuntimeException(rethrow);
}
}
@Override
public boolean containsKey(final Object term) {
return getIndex(term) != -1;
}
@Override
public long getLong(final Object o) {
final long result = getIndex(o);
return result == -1 ? defRetValue : result;
}
/** An iterator over the dump stream. It does not use the interval approximator—it just scans the file. */
private final class DumpStreamIterator implements ObjectIterator<CharSequence> {
/** The current block being enumerated. */
private int currBlock = -1;
/** The index of the next term that will be returned. */
private int index;
/** The mutable string used to return the result. */
final MutableString s = new MutableString();
private DumpStreamIterator() {
try {
dumpStream.position(0);
}
catch (IOException e) {
throw new RuntimeException(e);
}
dumpStream.readBits(0);
iteratorIsUsable = true;
}
@Override
public boolean hasNext() {
if (! iteratorIsUsable) throw new IllegalStateException("Get methods of this map have caused a stream repositioning");
return index < size;
}
@Override
public CharSequence next() {
if (! hasNext()) throw new NoSuchElementException();
try {
final int prefixLength;
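// If this is the first term of a new block, skip the alignment zeroes and start with an empty prefix.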
if (index == blockStart[currBlock + 1]) {
if (dumpStream.readBits() % blockSize != 0) dumpStream.skip(blockSize - dumpStream.readBits() % blockSize);
currBlock++;
prefixLength = 0;
}
else prefixLength = dumpStream.readUnary();
final int suffixLength = dumpStream.readUnary();
s.delete(prefixLength, s.length());
s.length(prefixLength + suffixLength);
for (int i = 0; i < suffixLength; i++)
s.charAt(i + prefixLength, symbol2char[decoder.decode(dumpStream)]);
index++;
return s;
}
catch (IOException e) {
throw new RuntimeException(e);
}
}
}
/** Returns an iterator over the map.
*
* <p>The iterator returned by this method directly scans the dump stream.
*
* <p>Note that the returned iterator uses the same stream as all get methods. Calling such methods while
* the iterator is being used will produce an {@link IllegalStateException}.
*
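* <p>For instance, assuming {@code map} is an instance of this class, all terms can be printed
* roughly as follows (a sketch; note that the iterator reuses a single mutable string, so terms
* must be copied if they are to be retained):
*
* <pre>{@code
* for (ObjectIterator<CharSequence> i = map.iterator(); i.hasNext();)
*     System.out.println(i.next());
* }</pre>
*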
* @return an iterator over the map that just scans the dump stream.
*/
public ObjectIterator<CharSequence> iterator() {
return new DumpStreamIterator();
}
@Override
public int size() {
return size;
}
private void writeObject(final ObjectOutputStream s) throws IOException {
s.defaultWriteObject();
if (selfContained) {
final FileInputStream fis = new FileInputStream(tempDumpStreamFilename);
IOUtils.copy(fis, s);
fis.close();
}
}
private void readObject(final ObjectInputStream s) throws IOException, ClassNotFoundException {
s.defaultReadObject();
if (selfContained) {
final File temp = File.createTempFile(this.getClass().getName(), ".dump");
temp.deleteOnExit();
tempDumpStreamFilename = temp.toString();
// TODO: propose Jakarta CopyUtils extension with length control and refactor.
FileOutputStream fos = new FileOutputStream(temp);
final byte[] b = new byte[64 * 1024];
int len;
while((len = s.read(b)) >= 0) fos.write(b, 0, len);
fos.close();
dumpStream = new InputBitStream(temp, (int)(blockSize / 8));
}
(this.cache = new Object2IntLinkedOpenHashMap<>()).defaultReturnValue(-1);
}
@SuppressWarnings("unchecked")
public static void main(final String[] arg) throws ClassNotFoundException, IOException, JSAPException, SecurityException, NoSuchMethodException {
final SimpleJSAP jsap = new SimpleJSAP(ImmutableExternalPrefixMap.class.getName(),
"Builds an external prefix map reading from standard input a newline-separated list of sorted terms or a serialised term list. If the dump stream name is not specified, the map will be self-contained.\n\n" +
"Note that if you read terms from stdin or from a serialized object all terms will have to be loaded in memory.",
new Parameter[] {
new FlaggedOption("blockSize", JSAP.INTSIZE_PARSER, (STD_BLOCK_SIZE / 1024) + "Ki", JSAP.NOT_REQUIRED, 'b', "block-size", "The size of a block in the dump stream."),
new Switch("serialised", 's', "serialised", "The data source (file or standard input) provides a serialised java.util.List of terms."),
new Switch("synchronised", 'S', "synchronised", "The serialised map will be synchronised."),
new Switch("zipped", 'z', "zipped", "Standard input is compressed in gzip format."),
new FlaggedOption("termFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "offline", "Read terms from this file instead of standard input."),
new FlaggedOption("encoding", ForNameStringParser.getParser(Charset.class), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The term list encoding."),
new UnflaggedOption("map", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised map."),
new UnflaggedOption("dump", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "An optional dump stream (the resulting map will not be self-contained).")
}
);
JSAPResult jsapResult = jsap.parse(arg);
if (jsap.messagePrinted()) return;
Collection<? extends CharSequence> termList;
final String termFile = jsapResult.getString("termFile");
final Charset encoding = (Charset)jsapResult.getObject("encoding");
final boolean zipped = jsapResult.getBoolean("zipped");
final boolean serialised = jsapResult.getBoolean("serialised");
final boolean synchronised = jsapResult.getBoolean("synchronised");
if (zipped && serialised) throw new IllegalArgumentException("The zipped and serialised options are incompatible");
if (serialised) termList = (List<? extends CharSequence>) (termFile != null ? BinIO.loadObject(termFile) : BinIO.loadObject(System.in));
else {
if (termFile != null) termList = new FileLinesCollection(termFile, encoding.name(), zipped);
else {
final ObjectArrayList<MutableString> list = new ObjectArrayList<>();
termList = list;
final FastBufferedReader terms = new FastBufferedReader(new InputStreamReader(
zipped ? new GZIPInputStream(System.in) : System.in, encoding.name()));
final MutableString term = new MutableString();
while(terms.readLine(term) != null) list.add(term.copy());
terms.close();
}
}
final ImmutableExternalPrefixMap immutableExternalPrefixMap = new ImmutableExternalPrefixMap(termList, jsapResult.getInt("blockSize"), jsapResult.getString("dump"));
BinIO.storeObject(synchronised ? StringMaps.synchronize(immutableExternalPrefixMap) : immutableExternalPrefixMap, jsapResult.getString("map"));
}
}