// Listing obtained from a "JAR search and dependency download from the Maven repository" page.
// Source of it.unimi.di.archive4j.tool.MergePreprocessedData, artifact archive4j (Maven / Gradle / Ivy).
package it.unimi.di.archive4j.tool;
/*
* Copyright (C) 2008-2013 Alessio Orlandi and Sebastiano Vigna
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
import it.unimi.di.big.mg4j.index.TermProcessor;
import it.unimi.di.big.mg4j.util.MG4JClassParser;
import it.unimi.dsi.Util;
import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.LongIterator;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction;
import it.unimi.dsi.sux4j.util.EliasFanoMonotoneLongBigList;
import it.unimi.dsi.util.Properties;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.util.StringMap;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import org.apache.commons.configuration.ConfigurationException;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
/** Filters and merges term data (term lists, frequencies, global counts) originated by one or more {@linkplain Preprocess preprocessing phases}
* and generates the corresponding {@link StringMap}.
*
 * <p>After {@link Preprocess} has performed the first pass over a collection, this class {@linkplain TermFilter filters} the collected terms
 * and merges the resulting data. Additionally, it generates the following files:
 *
 * <dl>
 * <dt><code><var>basename</var>.termmap</code></dt>
 * <dd>For each filtered term, its position in the term list.</dd>
 *
 * <dt><code><var>basename</var>.embed</code></dt>
 * <dd>The embedding list for this archive. It maps every term to
 * its position in the non-filtered term list. It is useful to map
 * terms of the archive in terms of an index over the same archive.</dd>
 * </dl>
 *
 * <p>Strategies to remove terms are provided by {@link TermFilter} implementations.
*
* @author Alessio Orlandi
* @author Sebastiano Vigna
*/
public class MergePreprocessedData {
/** The extension of the map for (filtered) terms. */
public static final String TERMMAP_EXTENSION = ".termmap";
/** The extension of the embedding list. */
public static final String EMBED_EXTENSION = ".embed";
/** Keys of the properties saved by this class; {@link Scan} uses them, too. */
public static enum PropertyKeys {
	/** The number of terms kept in the merged term list. */
	TERMS
}
/**
 * A predicate deciding whether a term is kept in the archive or filtered (that is, eliminated from it).
 *
 * @author Alessio Orlandi
 */
public static interface TermFilter {
	/** Decides whether the given term must be accepted.
	 *
	 * @param term the term.
	 * @param frequency the frequency of <code>term</code>.
	 * @return true if <code>term</code> must be accepted, false if it must be eliminated.
	 */
	boolean accept( MutableString term, int frequency );
}
/** A filter that eliminates mixed digit-nondigit terms. */
public static class MixedFilter implements TermFilter {
	/** Accepts a term whose characters are all digits or all non-digits; the empty term is accepted.
	 *
	 * @param word the term.
	 * @param frequency the frequency of the term (not used by this filter).
	 * @return false if the term contains both digits and non-digits, true otherwise.
	 */
	public boolean accept( final MutableString word, final int frequency ) {
		final int length = word.length();
		if ( length == 0 ) return true;
		final char[] chars = word.array();
		// The last character fixes the kind (digit or not) that every other character must share.
		final boolean digitExpected = Character.isDigit( chars[ length - 1 ] );
		for ( int pos = length - 1; pos-- != 0; )
			if ( Character.isDigit( chars[ pos ] ) != digitExpected ) return false;
		return true;
	}
}
/** A filter that keeps only terms whose length falls in a given range. */
public static class LengthFilter implements TermFilter {
	/** The minimum length of an accepted term (inclusive). */
	final int minLen;
	/** The maximum length of an accepted term (exclusive). */
	final int maxLen;

	/** Creates a new length filter.
	 *
	 * @param minLen the minimum length of an accepted term (inclusive).
	 * @param maxLen the maximum length of an accepted term (exclusive).
	 */
	public LengthFilter( final int minLen, final int maxLen ) {
		this.minLen = minLen;
		this.maxLen = maxLen;
	}

	/** Accepts a term if its length is at least {@link #minLen} and smaller than {@link #maxLen}.
	 *
	 * @param word the term.
	 * @param frequency the frequency of the term (not used by this filter).
	 * @return true if the length of <code>word</code> is in the accepted range.
	 */
	public boolean accept( final MutableString word, final int frequency ) {
		final int length = word.length();
		if ( length < minLen ) return false;
		return length < maxLen;
	}
}
/** Filter that eliminates terms in a given set. */
public static class StopwordFilter implements TermFilter {
private final ObjectOpenHashSet terms;
/** Creates a new filter using a given iterable object of terms.
*
* @param terms an iterable object; the returned terms will be copied into
* {@link MutableString} instances.
*/
public StopwordFilter( final Iterable extends CharSequence> terms ) {
this.terms = new ObjectOpenHashSet( Hash.DEFAULT_INITIAL_SIZE, Hash.VERY_FAST_LOAD_FACTOR );
for( CharSequence s : terms ) this.terms.add( new MutableString( s ) );
this.terms.trim();
}
public boolean accept( final MutableString word, final int frequency ) {
return ! terms.contains( word );
}
}
/** Filter that eliminates terms that are too frequent or not frequent enough. */
public static class FrequencyFilter implements TermFilter {
	/** The minimum frequency of an accepted term (inclusive). */
	final int minFreq;
	/** The maximum frequency of an accepted term (exclusive), or -1 for no upper bound. */
	final int maxFreq;

	/** Creates a new frequency filter.
	 *
	 * @param minFreq the minimum frequency of an accepted term (inclusive).
	 * @param maxFreq the maximum frequency of an accepted term (exclusive); -1 disables the upper bound.
	 */
	public FrequencyFilter( final int minFreq, final int maxFreq ) {
		this.minFreq = minFreq;
		this.maxFreq = maxFreq;
	}

	/** Accepts a term if its frequency is at least {@link #minFreq} and, unless {@link #maxFreq} is -1, smaller than {@link #maxFreq}.
	 *
	 * @param term the term (not used by this filter).
	 * @param frequency the frequency of the term.
	 * @return true if <code>frequency</code> is in the accepted range.
	 */
	public boolean accept( final MutableString term, final int frequency ) {
		if ( frequency < minFreq ) return false;
		return maxFreq == -1 || frequency < maxFreq;
	}
}
/** Opens, for each input name, the bit stream over the file obtained by appending {@link Preprocess#FREQUENCIES_EXTENSION}.
 *
 * @param inputNames the input names.
 * @return an array of bit streams parallel to <code>inputNames</code>.
 */
private static InputBitStream[] openFrequencyFiles( final CharSequence[] inputNames ) throws IOException {
	final InputBitStream[] result = new InputBitStream[ inputNames.length ];
	int k = 0;
	for ( final CharSequence name : inputNames ) {
		final String filename = name + Preprocess.FREQUENCIES_EXTENSION;
		result[ k++ ] = new InputBitStream( filename );
	}
	return result;
}
/** Opens, for each input name, an iterator over the longs in the file obtained by appending {@link Preprocess#COUNTS_EXTENSION}.
 *
 * @param inputNames the input names.
 * @return an array of iterators parallel to <code>inputNames</code>.
 */
private static LongIterator[] openCountFiles( final CharSequence[] inputNames ) throws IOException {
	final LongIterator[] result = new LongIterator[ inputNames.length ];
	int k = 0;
	for ( final CharSequence name : inputNames ) {
		final String filename = name + Preprocess.COUNTS_EXTENSION;
		result[ k++ ] = BinIO.asLongIterator( filename );
	}
	return result;
}
/** Opens, for each input name, an iterator over the UTF-8 lines of the file obtained by appending {@link Preprocess#TERMS_EXTENSION}.
 *
 * @param inputNames the input names.
 * @return an array of line iterators parallel to <code>inputNames</code>.
 */
private static FileLinesCollection.FileLinesIterator[] openTermFiles( final CharSequence[] inputNames ) {
	final FileLinesCollection.FileLinesIterator[] result = new FileLinesCollection.FileLinesIterator[ inputNames.length ];
	int k = 0;
	for ( final CharSequence name : inputNames ) {
		final FileLinesCollection lines = new FileLinesCollection( name + Preprocess.TERMS_EXTENSION, "UTF-8" );
		result[ k++ ] = lines.iterator();
	}
	return result;
}
/** Runs the merge process on the data of a single {@link Preprocess} run.
 *
 * <p>This is a convenience method that wraps the basename into a one-element array
 * and delegates to the array-based variant.
 *
 * @param inputBasename the basename of a previous {@link Preprocess} run.
 * @param outputBasename the output basename.
 * @param filters term filters that will be used to choose which terms to include in the
 * merged data.
 */
public static void run( final String inputBasename, final String outputBasename, final TermFilter[] filters ) throws IOException, ConfigurationException, IllegalArgumentException, ClassNotFoundException, IllegalAccessException, InvocationTargetException, InstantiationException, NoSuchMethodException {
	final String[] singleBasename = { inputBasename };
	run( singleBasename, outputBasename, filters );
}
/** Runs the merge process on the data of one or more {@link Preprocess} runs.
 *
 * <p>The properties of each run are loaded to check that all runs used the same term processor
 * and the same field, to sum up the number of documents, and to obtain the names of the batches
 * of each run. The batch names are then merged into <code>outputBasename</code>.
 *
 * @param inputBasename the basenames of one or more previous {@link Preprocess} runs.
 * @param outputBasename the output basename.
 * @param filters term filters that will be used to choose which terms to include in the
 * merged data.
 * @throws IllegalArgumentException if the runs use different term processors or different fields.
 */
public static void run( final String inputBasename[], final String outputBasename, final TermFilter[] filters ) throws IOException, ConfigurationException, IllegalArgumentException, ClassNotFoundException, IllegalAccessException, InvocationTargetException, InstantiationException, NoSuchMethodException {
	final ArrayList<String> inputNames = new ArrayList<String>();
	TermProcessor firstProcessor = null;
	String firstProcessorSpec = null;
	String firstField = null;
	int documents = 0;
	for( int i = 0; i < inputBasename.length; i++ ) {
		final Properties properties = new Properties( inputBasename[ i ] + Preprocess.PROPERTIES_EXTENSION );
		if ( i == 0 ) {
			// The first run fixes the term processor and the field all other runs must agree with.
			firstProcessor = ObjectParser.fromSpec( firstProcessorSpec = properties.getString( Preprocess.PropertyKeys.TERMPROCESSOR ), TermProcessor.class, MG4JClassParser.PACKAGE, new String[] { "getInstance" } );
			firstField = properties.getString( Preprocess.PropertyKeys.FIELD );
		}
		else {
			// Compare against the properties of the i-th run (not always those of the second one).
			final TermProcessor processor = ObjectParser.fromSpec( properties.getString( Preprocess.PropertyKeys.TERMPROCESSOR ), TermProcessor.class, MG4JClassParser.PACKAGE, new String[] { "getInstance" } );
			if ( firstProcessor == null || ! firstProcessor.equals( processor ) )
				throw new IllegalArgumentException( "Preprocessed data uses inconsistent term processing" );
			if ( firstField == null || ! firstField.equals( properties.getString( Preprocess.PropertyKeys.FIELD ) ) )
				throw new IllegalArgumentException( "Preprocessed data uses inconsistent fields" );
		}
		documents += properties.getInt( Preprocess.PropertyKeys.DOCUMENTS );
		// Batches are added from the last to the first one.
		int numBatches = properties.getInt( Preprocess.PropertyKeys.BATCHES );
		while( numBatches-- != 0 ) inputNames.add( Preprocess.batchName( inputBasename[ i ], numBatches ) );
	}
	// Properties of the merged data: same processor and field, total number of documents.
	final Properties properties = new Properties();
	properties.setProperty( Preprocess.PropertyKeys.TERMPROCESSOR, firstProcessorSpec );
	properties.setProperty( Preprocess.PropertyKeys.FIELD, firstField );
	properties.setProperty( Preprocess.PropertyKeys.DOCUMENTS, documents );
	properties.setFileName( outputBasename + Preprocess.PROPERTIES_EXTENSION );
	run( inputNames.toArray( new String[ 0 ] ), outputBasename, filters, properties );
}
/**
 * Runs the merge process.
 *
 * <p>For each input name, the term list, the frequency file and the count file are opened and
 * merged using a priority queue on the current term of each input (NOTE(review): this presumably
 * requires each input term list to be sorted; confirm against {@link Preprocess}). Occurrences of
 * the same term are coalesced by summing their frequencies and counts. Each term accepted by all
 * filters is written, with its frequency and count, to the output files; its position in the
 * non-filtered list is recorded to build the embedding list. Finally, the property
 * {@link PropertyKeys#TERMS} is set, the properties are saved and the term map is generated.
 *
 * @param inputNames the basenames for all sets of term lists and frequency files to merge.
 * @param outputBasename the output basename.
 * @param filters term filters that will be used to choose which terms to include in the
 * merged data.
 * @param properties an initialised property object containing additional properties to be
 * saved (usually, at least {@link Preprocess.PropertyKeys#TERMPROCESSOR} and {@link Preprocess.PropertyKeys#FIELD}).
 */
public static void run( final CharSequence[] inputNames, final String outputBasename, final TermFilter[] filters, final Properties properties ) throws IOException, ConfigurationException {
	final OutputBitStream outputFreq = new OutputBitStream( outputBasename + Preprocess.FREQUENCIES_EXTENSION );
	final DataOutputStream outputCounts = new DataOutputStream( new FastBufferedOutputStream( new FileOutputStream( outputBasename + Preprocess.COUNTS_EXTENSION ) ) );
	final PrintWriter outputTerms = new PrintWriter( new OutputStreamWriter( new FileOutputStream( outputBasename + Preprocess.TERMS_EXTENSION ), "UTF-8" ) );
	final InputBitStream[] inputFreqs = openFrequencyFiles( inputNames );
	final LongIterator[] inputCounts = openCountFiles( inputNames );
	final FileLinesCollection.FileLinesIterator[] inputTerms = openTermFiles( inputNames );

	// Parallel arrays holding the current term, frequency and count of each input.
	final MutableString[] pqTerms = new MutableString[ inputFreqs.length ];
	final int[] pqFrequencies = new int[ inputFreqs.length ];
	final long[] pqCounts = new long[ inputFreqs.length ];
	final ProgressLogger pl = new ProgressLogger();
	// The queue contains indices of inputs, ordered by their current term.
	final ObjectHeapSemiIndirectPriorityQueue<MutableString> heap = new ObjectHeapSemiIndirectPriorityQueue<MutableString>( pqTerms );
	for ( int i = 0; i < inputFreqs.length; i++ )
		if ( inputTerms[ i ].hasNext() ) {
			pqTerms[ i ] = inputTerms[ i ].next();
			pqFrequencies[ i ] = inputFreqs[ i ].readGamma();
			pqCounts[ i ] = inputCounts[ i ].nextLong();
			heap.enqueue( i );
		}

	pl.itemsName = "terms";
	pl.start( "Merging and discarding terms.." );
	final int numFilters = filters.length;
	int numTerms = 0, numFilteredTerms = 0, index;
	// Temporary file collecting, for each accepted term, its position in the non-filtered term list.
	final File filteredTermsFile = File.createTempFile( MergePreprocessedData.class.getName(), "filtered" );
	filteredTermsFile.deleteOnExit();
	final DataOutputStream filteredTerms = new DataOutputStream( new FastBufferedOutputStream( new FileOutputStream( filteredTermsFile ) ) );

	while ( !heap.isEmpty() ) {
		// The term is copied because the array entry is replaced while advancing the input.
		final MutableString current = pqTerms[ index = heap.first() ].copy();
		int freq = 0;
		// Counts are read and written as longs, so they must be accumulated in a long.
		long count = 0;
		// Recover all in-file and among-file duplicates.
		while ( !heap.isEmpty() && pqTerms[ index = heap.first() ].equals( current ) ) {
			freq += pqFrequencies[ index ];
			count += pqCounts[ index ];
			if ( inputTerms[ index ].hasNext() ) {
				pqTerms[ index ] = inputTerms[ index ].next();
				pqCounts[ index ] = inputCounts[ index ].nextLong();
				pqFrequencies[ index ] = inputFreqs[ index ].readGamma();
				heap.changed();
			}
			else heap.dequeue();
		}

		// A term is kept only if every filter accepts it: j reaches -1 only when no filter refused.
		int j;
		for ( j = numFilters; j-- != 0; ) if ( ! filters[ j ].accept( current, freq ) ) break;
		if ( j == -1 ) {
			outputTerms.println( current );
			outputFreq.writeGamma( freq );
			outputCounts.writeLong( count );
			filteredTerms.writeInt( numTerms );
			numFilteredTerms++;
		}
		numTerms++;
		pl.lightUpdate();
	}

	pl.done();
	pl.logger.info( "Filtered " + numFilteredTerms + " out of " + pl.count + " (" + Util.format( ( numFilteredTerms * 100.0 ) / pl.count ) + "%)" );
	for ( InputBitStream i : inputFreqs ) i.close();
	filteredTerms.close();
	outputFreq.close();
	outputTerms.close();
	outputCounts.close();

	// The embedding list is built from the positions stored in the temporary file.
	BinIO.storeObject( new EliasFanoMonotoneLongBigList( numFilteredTerms, pl.count, BinIO.asIntIterator( filteredTermsFile ) ), outputBasename + EMBED_EXTENSION );
	filteredTermsFile.delete();

	properties.setProperty( PropertyKeys.TERMS, numFilteredTerms );
	properties.save();

	pl.logger.info( "Generating term map..." );
	// The term map is built over the merged term list that was just written.
	final FileLinesCollection outputCollection = new FileLinesCollection( outputBasename + Preprocess.TERMS_EXTENSION, "UTF-8" );
	final StringMap<? extends CharSequence> smap = new ShiftAddXorSignedStringMap( outputCollection.iterator(), new LcpMonotoneMinimalPerfectHashFunction<CharSequence>( outputCollection, TransformationStrategies.prefixFreeUtf16() ) );
	BinIO.storeObject( smap, outputBasename + TERMMAP_EXTENSION );
}
/** Parses the command line, builds the requested {@linkplain TermFilter filters} and runs the merge process.
 *
 * <p>Filters are added in this order: stopwords, mixed terms, frequency, length. The frequency filter
 * is added only if a minimum or a maximum frequency was specified; the same holds for the length filter.
 *
 * @param args the command-line arguments.
 */
@SuppressWarnings("unchecked")
public static void main( String[] args ) throws Exception {
	SimpleJSAP jsap = new SimpleJSAP(
		MergePreprocessedData.class.getName(), "Filters and merges term data (term lists, frequencies, etc.) originated by one or more preprocessing phases and generates the corresponding string map and embedding list.",
		new Parameter[] {
			new Switch( "remove-mixed", 'x', "remove-mixed", "Remove mixed alphanumeric terms." ),
			new FlaggedOption( "stopwords", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "stopwords", "List of lowercase, termprocessed stopwords to be eliminated, one per line." ),
			new FlaggedOption( "min-length", JSAP.INTEGER_PARSER, "1", JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "min-length", "Minimum length of a term." ),
			new FlaggedOption( "max-length", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "max-length", "Maximum length of a term." ),
			new FlaggedOption( "min-freq", JSAP.INTEGER_PARSER, "0", JSAP.NOT_REQUIRED, 'm', "min-freq", "Minimum frequency of a term." ),
			new FlaggedOption( "max-freq", JSAP.INTEGER_PARSER, "-1", JSAP.NOT_REQUIRED, 'M', "max-freq", "Maximum frequency of a term." ),
			new UnflaggedOption( "output", JSAP.STRING_PARSER, JSAP.REQUIRED, "Output basename." ),
			new UnflaggedOption( "input", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.GREEDY, "Input basenames." ) } );

	JSAPResult jsapResult = jsap.parse( args );
	if ( jsap.messagePrinted() ) return;

	final ObjectArrayList<TermFilter> filters = new ObjectArrayList<TermFilter>();
	if ( jsapResult.userSpecified( "stopwords" ) ) filters.add( new StopwordFilter( new FileLinesCollection( jsapResult.getString( "stopwords" ), "UTF-8" ).allLines() ) );
	if ( jsapResult.getBoolean( "remove-mixed" ) ) filters.add( new MixedFilter() );
	if ( jsapResult.userSpecified( "min-freq" ) || jsapResult.userSpecified( "max-freq" ) )
		filters.add( new FrequencyFilter( jsapResult.getInt( "min-freq" ), jsapResult.getInt( "max-freq" ) ) );

	// max-length has no default value, so it must not be read unless the user specified it:
	// when only min-length is given, no upper bound on the length is applied.
	final boolean maxLengthSpecified = jsapResult.userSpecified( "max-length" );
	if ( maxLengthSpecified || jsapResult.userSpecified( "min-length" ) ) {
		final int maxLength = maxLengthSpecified ? jsapResult.getInt( "max-length" ) : Integer.MAX_VALUE;
		filters.add( new LengthFilter( jsapResult.getInt( "min-length" ), maxLength ) );
	}

	run( jsapResult.getStringArray( "input" ), jsapResult.getString( "output" ), filters.toArray( new TermFilter[ 0 ] ) );
}
}