it.unimi.dsi.util.FrontCodedStringList Maven / Gradle / Ivy
Show all versions of dsi-utils Show documentation
package it.unimi.dsi.util;
/*
* DSI utilities
*
* Copyright (C) 2002-2009 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
import it.unimi.dsi.fastutil.bytes.ByteArrayFrontCodedList;
import it.unimi.dsi.fastutil.chars.CharArrayFrontCodedList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;
import it.unimi.dsi.fastutil.objects.AbstractObjectList;
import it.unimi.dsi.fastutil.objects.AbstractObjectListIterator;
import it.unimi.dsi.fastutil.objects.ObjectListIterator;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Iterator;
import java.util.zip.GZIPInputStream;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import com.martiansoftware.jsap.stringparsers.ForNameStringParser;
import com.martiansoftware.jsap.stringparsers.IntSizeStringParser;
/** Compact storage of strings using front-coding compression.
*
* This class stores a list of strings using front-coding compression (of course,
* the compression will be reasonable only if the list is sorted, but you could
* also use instances of this class just as a handy way to manage a large
* amount of strings). It implements an immutable {@link
* it.unimi.dsi.fastutil.objects.ObjectList} that returns the i-th
* string (as a {@link MutableString}) when the {@link #get(int)} method is
* called with argument i. The returned mutable string may be freely
* modified.
*
*
As a commodity, this class provides a main method that reads from
* standard input a sequence of newline-separated words, and
* writes a corresponding serialized front-coded string list.
*
*
Implementation Details
*
* To store the list of strings, we use either a UTF-8 coded {@link
* ByteArrayFrontCodedList}, or a {@link CharArrayFrontCodedList}, depending on
* the value of the {@link #FrontCodedStringList(Iterator,int,boolean)
* utf8} parameter at creation time. In the first case, if the
* strings are ASCII-oriented the resulting array will be much smaller, but
* access times will increase manifold, as each string must be UTF-8 encoded
* before being returned.
*/
public class FrontCodedStringList extends AbstractObjectList implements Serializable {
public static final long serialVersionUID = 1;
/** The underlying {@link ByteArrayFrontCodedList}, or null
.*/
protected final ByteArrayFrontCodedList byteFrontCodedList;
/** The underlying {@link CharArrayFrontCodedList}, or null
.*/
protected final CharArrayFrontCodedList charFrontCodedList;
/** Whether this front-coded list is UTF-8 encoded. */
protected final boolean utf8;
/** Creates a new front-coded string list containing the character sequences returned by the given iterator.
*
* @param words an iterator returning {@linkplain CharSequence character sequences}.
* @param ratio the desired ratio.
* @param utf8 if true, the strings will be stored as UTF-8 byte arrays.
*/
public FrontCodedStringList( final Iterator extends CharSequence> words, final int ratio, final boolean utf8 ) {
this.utf8 = utf8;
if ( utf8 ) {
byteFrontCodedList = new ByteArrayFrontCodedList(
new AbstractObjectIterator() {
public boolean hasNext() { return words.hasNext(); }
public byte[] next() {
try {
return words.next().toString().getBytes( "UTF-8" );
}
catch( java.io.UnsupportedEncodingException cantHappen ) { return null; }
}
},
ratio );
charFrontCodedList = null;
}
else {
charFrontCodedList = new CharArrayFrontCodedList(
new AbstractObjectIterator() {
public boolean hasNext() { return words.hasNext(); }
public char[] next() {
CharSequence s = words.next();
int i = s.length();
final char[] a = new char[ i ];
while( i-- != 0 ) a[ i ] = s.charAt( i );
return a;
}
},
ratio );
byteFrontCodedList = null;
}
}
/** Creates a new front-coded string list containing the character sequences contained in the given collection.
*
* @param c a collection containing {@linkplain CharSequence character sequences}.
* @param ratio the desired ratio.
* @param utf8 if true, the strings will be stored as UTF-8 byte arrays.
*/
public FrontCodedStringList( final Collection extends CharSequence> c, final int ratio, final boolean utf8 ) {
this( c.iterator(), ratio, utf8 );
}
/** Returns whether this front-coded string list is storing its strings as UTF-8 encoded bytes.
*
* @return true if this front-coded string list is keeping its data as an array of UTF-8 encoded bytes.
*/
public boolean utf8() {
return utf8;
}
/** Returns the ratio of the underlying front-coded list.
*
* @return the ratio of the underlying front-coded list.
*/
public int ratio() {
return utf8 ? byteFrontCodedList.ratio() : charFrontCodedList.ratio();
}
/** Returns the element at the specified position in this front-coded as a mutable string.
*
* @param index an index in the list.
* @return a {@link MutableString} that will contain the string at the specified position. The string may be freely modified.
*/
public MutableString get( final int index ) {
return MutableString.wrap( utf8
? byte2Char( byteFrontCodedList.getArray( index ), null )
: charFrontCodedList.getArray( index ) );
}
/** Returns the element at the specified position in this front-coded list by storing it in a mutable string.
*
* @param index an index in the list.
* @param s a mutable string that will contain the string at the specified position.
*/
public void get( final int index, MutableString s ) {
if ( utf8 ) {
final byte[] a = byteFrontCodedList.getArray( index );
s.length( countUTF8Chars( a ) );
byte2Char( a, s.array() );
}
else {
int res = charFrontCodedList.get( index, s.array() );
if ( res < 0 ) {
s.length( s.array().length - res );
res = charFrontCodedList.get( index, s.array() );
}
else s.length( res );
}
}
/* The following methods are highly optimized UTF-8 converters exploiting
the fact that since it was ourselves in the first place who created the
coding, we can be sure it is correct. */
protected static int countUTF8Chars( final byte[] a ) {
final int length = a.length;
int result = 0, b;
for( int i = 0; i < length; i++ ) {
b = ( a[ i ] & 0xFF ) >> 4;
if ( b < 8 ) result++;
else if ( b < 14 ) {
result++;
i++;
}
else {
result++;
i += 2;
}
}
return result;
}
protected static char[] byte2Char( final byte[] a, char[] s ) {
final int length = a.length;
if ( s == null ) s = new char[ countUTF8Chars( a ) ];
int b, c, d;
for( int i = 0, j = 0; i < length; i++ ) {
b = a[ i ] & 0xFF;
if ( ( b >> 4 ) < 8 ) s[ j++ ] = (char)b;
else if ( ( b >> 4 ) < 14 ) {
c = a[ ++i ] & 0xFF;
if ( ( c & 0xC0 ) != 0x80 ) throw new IllegalStateException( "Malformed internal UTF-8 encoding" );
s[ j++ ] = (char)( ( ( b & 0x1F) << 6 ) | ( c & 0x3F ) );
}
else {
c = a[ ++i ] & 0xFF;
d = a[ ++i ];
if ( ( c & 0xC0 ) != 0x80 || ( d & 0xC0 ) != 0x80 ) throw new IllegalStateException( "Malformed internal UTF-8 encoding" );
s[ j++ ] = (char)( ( ( b & 0x0F ) << 12 ) | ( ( c & 0x3F ) << 6 ) | ( ( d & 0x3F ) << 0 ) );
}
}
return s;
}
public ObjectListIterator listIterator( final int k ) { return new AbstractObjectListIterator() {
ObjectListIterator> i = utf8 ? byteFrontCodedList.listIterator( k ) : charFrontCodedList.listIterator( k );
public boolean hasNext() { return i.hasNext(); }
public boolean hasPrevious() { return i.hasPrevious(); }
public MutableString next() { return MutableString.wrap( utf8 ? byte2Char( (byte[])i.next(), null ) : (char[])i.next() ); }
public MutableString previous() { return MutableString.wrap( utf8 ? byte2Char( (byte[])i.next(), null ) :(char[])i.previous() ); }
public int nextIndex() { return i.nextIndex(); }
public int previousIndex() { return i.previousIndex(); }
};
}
public int size() { return utf8 ? byteFrontCodedList.size() : charFrontCodedList.size(); }
public static void main( final String[] arg ) throws IOException, JSAPException, NoSuchMethodException {
final SimpleJSAP jsap = new SimpleJSAP( FrontCodedStringList.class.getName(), "Builds a front-coded string list reading from standard input a newline-separated ordered list of terms.",
new Parameter[] {
new FlaggedOption( "bufferSize", IntSizeStringParser.getParser(), "64Ki", JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of the I/O buffer used to read terms." ),
new FlaggedOption( "encoding", ForNameStringParser.getParser( Charset.class ), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The term file encoding." ),
new FlaggedOption( "ratio", IntSizeStringParser.getParser(), "4", JSAP.NOT_REQUIRED, 'r', "ratio", "The compression ratio." ),
new Switch( "utf8", 'u', "utf8", "Store the strings as UTF-8 byte arrays." ),
new Switch( "zipped", 'z', "zipped", "The term list is compressed in gzip format." ),
new UnflaggedOption( "frontCodedList", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised front-coded list." )
});
JSAPResult jsapResult = jsap.parse( arg );
if ( jsap.messagePrinted() ) return;
final int bufferSize = jsapResult.getInt( "bufferSize" );
final int ratio = jsapResult.getInt( "ratio" );
final boolean utf8 = jsapResult.getBoolean( "utf8" );
final boolean zipped = jsapResult.getBoolean( "zipped" );
final String listName = jsapResult.getString( "frontCodedList" );
final Charset encoding = (Charset)jsapResult.getObject( "encoding" );
final ProgressLogger pl = new ProgressLogger();
pl.itemsName = "words";
pl.start( "Reading words..." );
final FrontCodedStringList frontCodedStringList = new FrontCodedStringList( new LineIterator( new FastBufferedReader(
new InputStreamReader( zipped ? new GZIPInputStream( System.in ) : System.in, encoding ), bufferSize ), pl ), ratio, utf8 );
pl.done();
System.err.print( "Writing to file..." );
BinIO.storeObject( frontCodedStringList, listName );
System.err.println( " done." );
}
}