
src.it.unimi.dsi.util.FrontCodedStringList Maven / Gradle / Ivy
/*
* DSI utilities
*
* Copyright (C) 2002-2023 Sebastiano Vigna
*
* This program and the accompanying materials are made available under the
* terms of the GNU Lesser General Public License v2.1 or later,
* which is available at
* http://www.gnu.org/licenses/old-licenses/lgpl-2.1-standalone.html,
* or the Apache Software License 2.0, which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE.
*
* SPDX-License-Identifier: LGPL-2.1-or-later OR Apache-2.0
*/
package it.unimi.dsi.util;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Iterator;
import java.util.RandomAccess;
import java.util.zip.GZIPInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Charsets;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import com.martiansoftware.jsap.stringparsers.ForNameStringParser;
import com.martiansoftware.jsap.stringparsers.IntSizeStringParser;
import it.unimi.dsi.fastutil.bytes.ByteArrayFrontCodedList;
import it.unimi.dsi.fastutil.chars.CharArrayFrontCodedList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.AbstractObjectList;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.fastutil.objects.ObjectListIterator;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
/**
* Compact storage of strings using front-coding compression (also known as compression by prefix
* omission).
*
*
* This class stores a list of strings using front-coding (also known as prefix-omission)
* compression; the compression will be reasonable only if the list is sorted, but you could also
* use instances of this class just as a handy way to manage a large amount of strings. It
* implements an immutable {@link it.unimi.dsi.fastutil.objects.ObjectList} that returns the
* i-th string (as a {@link MutableString}) when the {@link #get(int)} method is called
* with argument i. The returned mutable string may be freely modified.
*
*
* As a commodity, this class provides a main method that reads from standard input a sequence of
* newline-separated strings, and writes a corresponding serialized front-coded string list.
*
*
Implementation Details
*
*
* To store the list of strings, we use either a UTF-8 coded {@link ByteArrayFrontCodedList}, or a
* {@link CharArrayFrontCodedList}, depending on the value of the
* {@link #FrontCodedStringList(Iterator,int,boolean) utf8} parameter at creation time. In the first
* case, if the strings are ASCII-oriented the resulting array will be much smaller, but access
* times will increase manifold, as each string must be UTF-8 decoded before being returned.
*/
public class FrontCodedStringList extends AbstractObjectList implements RandomAccess, Serializable {
public static final long serialVersionUID = 1;
/** The underlying {@link ByteArrayFrontCodedList}, or {@code null}.*/
protected final ByteArrayFrontCodedList byteFrontCodedList;
/** The underlying {@link CharArrayFrontCodedList}, or {@code null}.*/
protected final CharArrayFrontCodedList charFrontCodedList;
/** Whether this front-coded list is UTF-8 encoded. */
protected final boolean utf8;
/** Creates a new front-coded string list containing the character sequences returned by the given iterator.
*
* @param words an iterator returning {@linkplain CharSequence character sequences}.
* @param ratio the desired ratio.
* @param utf8 if true, the strings will be stored as UTF-8 byte arrays.
*/
public FrontCodedStringList(final Iterator extends CharSequence> words, final int ratio, final boolean utf8) {
this.utf8 = utf8;
if (utf8) {
byteFrontCodedList = new ByteArrayFrontCodedList(
new ObjectIterator() {
@Override
public boolean hasNext() { return words.hasNext(); }
@SuppressWarnings("null")
@Override
public byte[] next() {
return words.next().toString().getBytes(Charsets.UTF_8);
}
},
ratio);
charFrontCodedList = null;
}
else {
charFrontCodedList = new CharArrayFrontCodedList(
new ObjectIterator() {
@Override
public boolean hasNext() { return words.hasNext(); }
@Override
public char[] next() {
final CharSequence s = words.next();
int i = s.length();
final char[] a = new char[i];
while(i-- != 0) a[i] = s.charAt(i);
return a;
}
},
ratio);
byteFrontCodedList = null;
}
}
/** Creates a new front-coded string list containing the character sequences contained in the given collection.
*
* @param c a collection containing {@linkplain CharSequence character sequences}.
* @param ratio the desired ratio.
* @param utf8 if true, the strings will be stored as UTF-8 byte arrays.
*/
public FrontCodedStringList(final Collection extends CharSequence> c, final int ratio, final boolean utf8) {
this(c.iterator(), ratio, utf8);
}
/** Returns whether this front-coded string list is storing its strings as UTF-8 encoded bytes.
*
* @return true if this front-coded string list is keeping its data as an array of UTF-8 encoded bytes.
*/
public boolean utf8() {
return utf8;
}
/** Returns the ratio of the underlying front-coded list.
*
* @return the ratio of the underlying front-coded list.
*/
public int ratio() {
return utf8 ? byteFrontCodedList.ratio() : charFrontCodedList.ratio();
}
/**
* Returns the element at the specified position in this front-coded string list as a mutable
* string.
*
* @param index an index in the list.
* @return a {@link MutableString} that will contain the string at the specified position. The
* string may be freely modified.
*/
@Override
public MutableString get(final int index) {
return MutableString.wrap(utf8
? byte2Char(byteFrontCodedList.getArray(index), null)
: charFrontCodedList.getArray(index));
}
/**
* Returns the element at the specified position in this front-coded string list by storing it in a
* mutable string.
*
* @param index an index in the list.
* @param s a mutable string that will contain the string at the specified position.
*/
public void get(final int index, final MutableString s) {
if (utf8) {
final byte[] a = byteFrontCodedList.getArray(index);
s.length(countUTF8Chars(a));
byte2Char(a, s.array());
}
else {
s.length(s.array().length);
int res = charFrontCodedList.get(index, s.array());
if (res < 0) {
s.length(s.array().length - res);
res = charFrontCodedList.get(index, s.array());
}
else s.length(res);
}
}
/* The following methods are highly optimized UTF-8 converters exploiting
the fact that since it was ourselves in the first place who created the
coding, we can be sure it is correct. */
protected static int countUTF8Chars(final byte[] a) {
final int length = a.length;
int result = 0, b;
for(int i = 0; i < length; i++) {
b = (a[i] & 0xFF) >> 4;
if (b < 8) result++;
else if (b < 14) {
result++;
i++;
}
else if (b < 15) {
result++;
i += 2;
}
else {
// Surrogate pair (yuck!)
result += 2;
i += 4;
}
}
return result;
}
protected static char[] byte2Char(final byte[] a, char[] s) {
final int length = a.length;
if (s == null) s = new char[countUTF8Chars(a)];
int b, c, d, t;
for(int i = 0, j = 0; i < length; i++) {
b = a[i] & 0xFF;
t = b >> 4;
if (t < 8) s[j++] = (char)b;
else if (t < 14) {
c = a[++i] & 0xFF;
if ((c & 0xC0) != 0x80) throw new IllegalStateException("Malformed internal UTF-8 encoding");
s[j++] = (char)(((b & 0x1F) << 6) | (c & 0x3F));
}
else if (t < 15){
c = a[++i] & 0xFF;
d = a[++i];
if ((c & 0xC0) != 0x80 || (d & 0xC0) != 0x80) throw new IllegalStateException("Malformed internal UTF-8 encoding");
s[j++] = (char)(((b & 0x0F) << 12) | ((c & 0x3F) << 6) | ((d & 0x3F) << 0));
}
else {
// Surrogate pair (yuck!)
final String surrogatePair = new String(a, i, 4, Charsets.UTF_8);
s[j++] = surrogatePair.charAt(0);
s[j++] = surrogatePair.charAt(1);
i += 3;
}
}
return s;
}
@Override
public ObjectListIterator listIterator(final int k) { return new ObjectListIterator() {
ObjectListIterator> i = utf8 ? byteFrontCodedList.listIterator(k) : charFrontCodedList.listIterator(k);
@Override
public boolean hasNext() { return i.hasNext(); }
@Override
public boolean hasPrevious() { return i.hasPrevious(); }
@Override
public MutableString next() { return MutableString.wrap(utf8 ? byte2Char((byte[])i.next(), null) : (char[])i.next()); }
@Override
public MutableString previous() { return MutableString.wrap(utf8 ? byte2Char((byte[])i.previous(), null) :(char[])i.previous()); }
@Override
public int nextIndex() { return i.nextIndex(); }
@Override
public int previousIndex() { return i.previousIndex(); }
};
}
@Override
public int size() { return utf8 ? byteFrontCodedList.size() : charFrontCodedList.size(); }
public static void main(final String[] arg) throws IOException, JSAPException, NoSuchMethodException {
final SimpleJSAP jsap = new SimpleJSAP(FrontCodedStringList.class.getName(), "Builds a front-coded string list reading from standard input a newline-separated ordered list of strings.",
new Parameter[] {
new FlaggedOption("bufferSize", IntSizeStringParser.getParser(), "64Ki", JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of the I/O buffer used to read strings."),
new FlaggedOption("encoding", ForNameStringParser.getParser(Charset.class), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The file encoding."),
new FlaggedOption("ratio", IntSizeStringParser.getParser(), "4", JSAP.NOT_REQUIRED, 'r', "ratio", "The compression ratio."),
new Switch("utf8", 'u', "utf8", "Store the strings as UTF-8 byte arrays."),
new Switch("zipped", 'z', "zipped", "The string list is compressed in gzip format."),
new UnflaggedOption("frontCodedList", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised front-coded list.")
});
final JSAPResult jsapResult = jsap.parse(arg);
if (jsap.messagePrinted()) return;
final int bufferSize = jsapResult.getInt("bufferSize");
final int ratio = jsapResult.getInt("ratio");
final boolean utf8 = jsapResult.getBoolean("utf8");
final boolean zipped = jsapResult.getBoolean("zipped");
final String listName = jsapResult.getString("frontCodedList");
final Charset encoding = (Charset)jsapResult.getObject("encoding");
final Logger logger = LoggerFactory.getLogger(FrontCodedStringList.class);
final ProgressLogger pl = new ProgressLogger(logger);
pl.displayFreeMemory = true;
pl.displayLocalSpeed = true;
pl.itemsName = "strings";
pl.start("Reading strings...");
final FrontCodedStringList frontCodedStringList = new FrontCodedStringList(new LineIterator(new FastBufferedReader(
new InputStreamReader(zipped ? new GZIPInputStream(System.in) : System.in, encoding), bufferSize), pl), ratio, utf8);
pl.done();
logger.info("Writing front-coded list to file...");
BinIO.storeObject(frontCodedStringList, listName);
logger.info("Completed.");
}
}