
// Source of it.unimi.dsi.util.PermutedFrontCodedStringList, from the dsiutils artifact (Maven / Gradle / Ivy).
package it.unimi.dsi.util;
/*
* DSI utilities
*
* Copyright (C) 2002-2016 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.ints.IntIterators;
import it.unimi.dsi.fastutil.ints.IntListIterator;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.TextIO;
import it.unimi.dsi.fastutil.objects.AbstractObjectList;
import it.unimi.dsi.fastutil.objects.AbstractObjectListIterator;
import it.unimi.dsi.fastutil.objects.ObjectListIterator;
import it.unimi.dsi.lang.MutableString;
import java.io.IOException;
import java.io.Serializable;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
/** A {@link it.unimi.dsi.util.FrontCodedStringList} whose indices are permuted.
*
* It may happen that a list of strings compresses very well
* using front coding, but unfortunately alphabetical order is not
* the right order for the strings in the list. Instances of this class
* wrap an instance of {@link it.unimi.dsi.util.FrontCodedStringList}
* together with a permutation π: inquiries with index i will
* actually return the string with index πi.
*
*
In case you start from a newline-delimited non-sorted list of
* UTF-8 strings, the simplest way to build
* an instance of this map is obtaining a front-coded string list and
* a permutation with a simple UN*X pipe (which also avoids storing the sorted strings):
*
* nl -v0 -nln | sort -k2 | tee >(cut -f1 >perm.txt) \
* | cut -f2 | java it.unimi.dsi.util.FrontCodedStringList tmp-lex.fcl
*
* The above command will read a list of strings from standard input,
* output a their sorted index list in perm.txt
and create a tmp-lex.fcl
front-coded
* string list containing the sorted list of strings.
*
* Important: you must be sure to be using the byte-by-byte collation order—in UN*X,
* be sure that LC_COLLATE=C
. Failure to do so will result in an order-of-magnitude-slower sorting and
* worse compression.
*
*
Now, in perm.txt
you will find the permutation that you have to pass to
* this class (given that you will use the option -i
). So the last step is just
*
* java it.unimi.dsi.util.PermutedFrontCodedStringList -i -t tmp-lex.fcl perm.txt your.fcl
*
*/
public class PermutedFrontCodedStringList extends AbstractObjectList implements Serializable {
public static final long serialVersionUID = -7046029254386353130L;
/** The underlying front-coded string list. */
final protected FrontCodedStringList frontCodedStringList;
/** The permutation. */
final protected int[] permutation;
/** Creates a new permuted front-coded string list using a given front-coded string list and permutation.
*
* @param frontCodedStringList the underlying front-coded string list.
* @param permutation the underlying permutation.
*/
public PermutedFrontCodedStringList( final FrontCodedStringList frontCodedStringList, final int[] permutation ) {
if ( frontCodedStringList.size() != permutation.length ) throw new IllegalArgumentException( "The front-coded string list contains " + frontCodedStringList.size() + " strings, but the permutation is on " + permutation.length + " elements." );
this.frontCodedStringList = frontCodedStringList;
this.permutation = permutation;
}
public CharSequence get( final int index ) {
return frontCodedStringList.get( permutation[ index ] );
}
/** Returns the element at the specified position in this front-coded list by storing it in a mutable string.
*
* @param index an index in the list.
* @param s a mutable string that will contain the string at the specified position.
*/
public void get( final int index, final MutableString s ) {
frontCodedStringList.get( permutation[ index ], s );
}
public int size() {
return frontCodedStringList.size();
}
public ObjectListIterator listIterator( final int k ) { return new AbstractObjectListIterator() {
final IntListIterator i = IntIterators.fromTo( 0, frontCodedStringList.size() );
public boolean hasNext() { return i.hasNext(); }
public boolean hasPrevious() { return i.hasPrevious(); }
public CharSequence next() { return frontCodedStringList.get( permutation[ i.nextInt() ] ); }
public CharSequence previous() { return frontCodedStringList.get( permutation[ i.previousInt() ] ); }
public int nextIndex() { return i.nextIndex(); }
public int previousIndex() { return i.previousIndex(); }
};
}
public static void main( final String[] arg ) throws IOException, ClassNotFoundException, JSAPException {
SimpleJSAP jsap = new SimpleJSAP( PermutedFrontCodedStringList.class.getName(), "Builds a permuted front-coded list of strings using a given front-coded string list and a permutation (either in text or binary format).",
new Parameter[] {
new Switch( "invert", 'i', "invert", "Invert permutation before creating the permuted list." ),
new Switch( "text", 't', "text", "The permutation is a text file." ),
new UnflaggedOption( "list", JSAP.STRING_PARSER, JSAP.REQUIRED, "A front-coded string list." ),
new UnflaggedOption( "permutation", JSAP.STRING_PARSER, JSAP.REQUIRED, "A permutation for the indices of the list (in DataInput format, unless you specify --text)." ),
new UnflaggedOption( "permutedList", JSAP.STRING_PARSER, JSAP.REQUIRED, "A the filename for the resulting permuted list." ),
} );
JSAPResult jsapResult = jsap.parse( arg );
if ( jsap.messagePrinted() ) return;
final String permutationFile = jsapResult.getString( "permutation" );
final int[] permutation = jsapResult.userSpecified( "text" )
? IntIterators.unwrap( TextIO.asIntIterator( permutationFile ) )
: BinIO.loadInts( permutationFile );
if ( jsapResult.getBoolean( "invert" ) ) Util.invertPermutationInPlace( permutation );
BinIO.storeObject(
new PermutedFrontCodedStringList( (FrontCodedStringList)BinIO.loadObject( jsapResult.getString( "list" ) ), permutation ),
jsapResult.getString( "permutedList" )
);
}
}