All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.util.PermutedFrontCodedStringList Maven / Gradle / Ivy

Go to download

The DSI utilities are a mishmash of classes accumulated during the last twenty years in projects developed at the DSI (Dipartimento di Scienze dell'Informazione, i.e., Information Sciences Department), now DI (Dipartimento di Informatica, i.e., Informatics Department), of the Università degli Studi di Milano.

There is a newer version: 2.7.3
Show newest version
package it.unimi.dsi.util;

import java.io.IOException;
import java.io.Serializable;

import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;

/*
 * DSI utilities
 *
 * Copyright (C) 2002-2018 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.ints.IntIterators;
import it.unimi.dsi.fastutil.ints.IntListIterator;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.TextIO;
import it.unimi.dsi.fastutil.objects.AbstractObjectList;
import it.unimi.dsi.fastutil.objects.ObjectListIterator;
import it.unimi.dsi.lang.MutableString;

/** A {@link it.unimi.dsi.util.FrontCodedStringList} whose indices are permuted.
 *
 * 

It may happen that a list of strings compresses very well * using front coding, but unfortunately alphabetical order is not * the right order for the strings in the list. Instances of this class * wrap an instance of {@link it.unimi.dsi.util.FrontCodedStringList} * together with a permutation π: inquiries with index i will * actually return the string with index πi. * *

In case you start from a newline-delimited non-sorted list of * UTF-8 strings, the simplest way to build * an instance of this map is obtaining a front-coded string list and * a permutation with a simple UN*X pipe (which also avoids storing the sorted strings): *

 * nl -v0 -nln | sort -k2 | tee >(cut -f1 >perm.txt) \
 * 	| cut -f2 | java it.unimi.dsi.util.FrontCodedStringList tmp-lex.fcl
 * 
* The above command will read a list of strings from standard input, * output a their sorted index list in perm.txt and create a tmp-lex.fcl front-coded * string list containing the sorted list of strings. * *

Important: you must be sure to be using the byte-by-byte collation order—in UN*X, * be sure that LC_COLLATE=C. Failure to do so will result in an order-of-magnitude-slower sorting and * worse compression. * *

Now, in perm.txt you will find the permutation that you have to pass to * this class (given that you will use the option -i). So the last step is just *

 * java it.unimi.dsi.util.PermutedFrontCodedStringList -i -t tmp-lex.fcl perm.txt your.fcl
 * 
*/ public class PermutedFrontCodedStringList extends AbstractObjectList implements Serializable { public static final long serialVersionUID = -7046029254386353130L; /** The underlying front-coded string list. */ final protected FrontCodedStringList frontCodedStringList; /** The permutation. */ final protected int[] permutation; /** Creates a new permuted front-coded string list using a given front-coded string list and permutation. * * @param frontCodedStringList the underlying front-coded string list. * @param permutation the underlying permutation. */ public PermutedFrontCodedStringList(final FrontCodedStringList frontCodedStringList, final int[] permutation) { if (frontCodedStringList.size() != permutation.length) throw new IllegalArgumentException("The front-coded string list contains " + frontCodedStringList.size() + " strings, but the permutation is on " + permutation.length + " elements."); this.frontCodedStringList = frontCodedStringList; this.permutation = permutation; } @Override public CharSequence get(final int index) { return frontCodedStringList.get(permutation[index]); } /** Returns the element at the specified position in this front-coded list by storing it in a mutable string. * * @param index an index in the list. * @param s a mutable string that will contain the string at the specified position. 
*/ public void get(final int index, final MutableString s) { frontCodedStringList.get(permutation[index], s); } @Override public int size() { return frontCodedStringList.size(); } @Override public ObjectListIterator listIterator(final int k) { return new ObjectListIterator() { final IntListIterator i = IntIterators.fromTo(0, frontCodedStringList.size()); @Override public boolean hasNext() { return i.hasNext(); } @Override public boolean hasPrevious() { return i.hasPrevious(); } @Override public CharSequence next() { return frontCodedStringList.get(permutation[i.nextInt()]); } @Override public CharSequence previous() { return frontCodedStringList.get(permutation[i.previousInt()]); } @Override public int nextIndex() { return i.nextIndex(); } @Override public int previousIndex() { return i.previousIndex(); } }; } public static void main(final String[] arg) throws IOException, ClassNotFoundException, JSAPException { SimpleJSAP jsap = new SimpleJSAP(PermutedFrontCodedStringList.class.getName(), "Builds a permuted front-coded list of strings using a given front-coded string list and a permutation (either in text or binary format).", new Parameter[] { new Switch("invert", 'i', "invert", "Invert permutation before creating the permuted list."), new Switch("text", 't', "text", "The permutation is a text file."), new UnflaggedOption("list", JSAP.STRING_PARSER, JSAP.REQUIRED, "A front-coded string list."), new UnflaggedOption("permutation", JSAP.STRING_PARSER, JSAP.REQUIRED, "A permutation for the indices of the list (in DataInput format, unless you specify --text)."), new UnflaggedOption("permutedList", JSAP.STRING_PARSER, JSAP.REQUIRED, "A the filename for the resulting permuted list."), }); JSAPResult jsapResult = jsap.parse(arg); if (jsap.messagePrinted()) return; final String permutationFile = jsapResult.getString("permutation"); final int[] permutation = jsapResult.userSpecified("text") ? 
IntIterators.unwrap(TextIO.asIntIterator(permutationFile)) : BinIO.loadInts(permutationFile); if (jsapResult.getBoolean("invert")) Util.invertPermutationInPlace(permutation); BinIO.storeObject( new PermutedFrontCodedStringList((FrontCodedStringList)BinIO.loadObject(jsapResult.getString("list")), permutation), jsapResult.getString("permutedList") ); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy