src.it.unimi.di.archive4j.tool.MergePreprocessedData Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of archive4j Show documentation
Archive4J is a suite of tools to store compactly term/count information of a document collection.
There is a newer version: 1.3.3
package it.unimi.di.archive4j.tool;

/*
 * Copyright (C) 2008-2013 Alessio Orlandi and Sebastiano Vigna
 *
 *  This program is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */
import it.unimi.di.big.mg4j.index.TermProcessor;
import it.unimi.di.big.mg4j.util.MG4JClassParser;
import it.unimi.dsi.Util;
import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.LongIterator;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction;
import it.unimi.dsi.sux4j.util.EliasFanoMonotoneLongBigList;
import it.unimi.dsi.util.Properties;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.util.StringMap;

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

import org.apache.commons.configuration.ConfigurationException;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;


/** Filters and merges term data (term lists, frequencies, global counts) originated by one or more {@linkplain Preprocess preprocessing phases}
 * and generates the corresponding {@link StringMap}.
 * 
 * <p>After {@link Preprocess} has performed the first pass over a collection, this class {@linkplain TermFilter filters} the collected terms
 * and merges the resulting data. Additionally, it generates the following files:
 * 
 * <dl>
 * 
 * <dt><samp><var>basename</var>.termmap</samp></dt>
 * 
 * <dd>For each filtered term, its position in the term list.</dd>
 * 
 * <dt><samp><var>basename</var>.embed</samp></dt>
 *
 * <dd>The embedding list for this archive. It maps every term to
 * its position in the non-filtered term list. It is useful to map
 * terms of the archive to terms of an index over the same archive.</dd>
 *
 * </dl>
 * 
 * <p>Strategies to remove terms are provided by {@link TermFilter} implementations.
 * 
 * @author Alessio Orlandi
 * @author Sebastiano Vigna
 */

public class MergePreprocessedData {
	
	/** The extension of the file in which the string map built over the retained (i.e., not filtered out) terms is stored. */
	public static final String TERMMAP_EXTENSION = ".termmap";

	/** The extension of the file in which the embedding list (for each retained term, its index in the complete merged term list) is stored. */
	public static final String EMBED_EXTENSION = ".embed";

	/** Keys of the properties written by this class; they are also used by {@link Scan}. */
	public enum PropertyKeys {
		/** The number of terms that survived filtering. */
		TERMS;
	}
	/**
	 * Interface used to specify whether a term must be filtered (that is, eliminated from the archive) or not.
	 * 
	 * @author Alessio Orlandi
	 */
	public static interface TermFilter {
		/** Returns true if the given term must be accepted.
		 * 
		 * @param term the term.
		 * @param frequency the frequency of term.
		 */
		public boolean accept( final MutableString term, final int frequency );
	}

	/** A filter that rejects terms containing both digit and non-digit characters.
	 *
	 * <p>Terms made only of digits, only of non-digits, or of no character at all are accepted.
	 */
	public static class MixedFilter implements TermFilter {
		public boolean accept( final MutableString word, final int frequency ) {
			final char[] chars = word.array();
			boolean sawDigit = false;
			boolean sawNonDigit = false;

			// Scan backwards, stopping as soon as both kinds of character have been seen.
			for ( int pos = word.length() - 1; pos >= 0; pos-- ) {
				if ( Character.isDigit( chars[ pos ] ) ) sawDigit = true;
				else sawNonDigit = true;

				if ( sawDigit && sawNonDigit ) return false;
			}

			return true;
		}
	}

	/** A filter that accepts only terms whose length lies in a given half-open interval. */
	public static class LengthFilter implements TermFilter {
		/** The minimum length of an accepted term (inclusive). */
		final int minLen;
		/** The maximum length of an accepted term (exclusive). */
		final int maxLen;

		/** Creates a new length filter.
		 * 
		 * @param minLen the minimum length of an accepted term (inclusive).
		 * @param maxLen the maximum length of an accepted term (exclusive).
		 */
		public LengthFilter( final int minLen, final int maxLen ) {
			this.minLen = minLen;
			this.maxLen = maxLen;
		}

		public boolean accept( final MutableString word, final int frequency ) {
			final int length = word.length();
			if ( length < minLen ) return false;
			return length < maxLen;
		}
	}

	/** Filter that eliminates terms in a given set. */
	public static class StopwordFilter implements TermFilter {

		/** The set of terms to be rejected, stored as {@link MutableString} instances so that lookups with a {@link MutableString} match. */
		private final ObjectOpenHashSet<MutableString> terms;

		/** Creates a new filter using a given iterable object of terms.
		 * 
		 * @param terms an iterable object; the returned terms will be copied into
		 * {@link MutableString} instances.
		 */
		public StopwordFilter( final Iterable<? extends CharSequence> terms ) {
			this.terms = new ObjectOpenHashSet<MutableString>( Hash.DEFAULT_INITIAL_SIZE, Hash.VERY_FAST_LOAD_FACTOR );
			for( CharSequence s : terms ) this.terms.add( new MutableString( s ) );
			// All elements are known at this point: release the unused part of the table.
			this.terms.trim();
		}

		/** Accepts a term if and only if it does not belong to the stopword set; the frequency is ignored. */
		public boolean accept( final MutableString word, final int frequency ) {
			return ! terms.contains( word );
		}
	}

	/** Filter that accepts only terms whose frequency lies in a given half-open interval.
	 *
	 * <p>A maximum frequency of -1 disables the upper bound.
	 */
	public static class FrequencyFilter implements TermFilter {
		/** The minimum frequency of an accepted term (inclusive). */
		final int minFreq;
		/** The maximum frequency of an accepted term (exclusive), or -1 for no upper bound. */
		final int maxFreq;

		/** Creates a new frequency filter.
		 * 
		 * @param minFreq the minimum frequency of an accepted term (inclusive).
		 * @param maxFreq the maximum frequency of an accepted term (exclusive), or -1 for no upper bound.
		 */
		public FrequencyFilter( final int minFreq, final int maxFreq ) {
			this.minFreq = minFreq;
			this.maxFreq = maxFreq;
		}

		public boolean accept( final MutableString term, final int frequency ) {
			if ( frequency < minFreq ) return false;
			return maxFreq == -1 || frequency < maxFreq;
		}
	}

	/** Opens, for each input name, the associated frequency file as a bit stream.
	 *
	 * @param inputNames the input names; the frequency extension is appended to each of them.
	 * @return an array of bit streams parallel to <code>inputNames</code>.
	 */
	private static InputBitStream[] openFrequencyFiles( final CharSequence[] inputNames ) throws IOException {
		final InputBitStream[] result = new InputBitStream[ inputNames.length ];
		int slot = 0;

		for ( final CharSequence name : inputNames ) {
			final String filename = name + Preprocess.FREQUENCIES_EXTENSION;
			result[ slot++ ] = new InputBitStream( filename );
		}

		return result;
	}

	/** Opens, for each input name, the associated count file as an iterator over longs.
	 *
	 * @param inputNames the input names; the count extension is appended to each of them.
	 * @return an array of iterators parallel to <code>inputNames</code>.
	 */
	private static LongIterator[] openCountFiles( final CharSequence[] inputNames ) throws IOException {
		final LongIterator[] counts = new LongIterator[ inputNames.length ];
		int slot = 0;

		while ( slot < counts.length ) {
			final String filename = inputNames[ slot ] + Preprocess.COUNTS_EXTENSION;
			counts[ slot++ ] = BinIO.asLongIterator( filename );
		}

		return counts;
	}

	/** Opens, for each input name, the associated term file (read as UTF-8) as an iterator over its lines.
	 *
	 * @param inputNames the input names; the term extension is appended to each of them.
	 * @return an array of line iterators parallel to <code>inputNames</code>.
	 */
	private static FileLinesCollection.FileLinesIterator[] openTermFiles( final CharSequence[] inputNames ) {
		final int n = inputNames.length;
		final FileLinesCollection.FileLinesIterator[] iterators = new FileLinesCollection.FileLinesIterator[ n ];

		for ( int k = 0; k != n; k++ ) {
			final FileLinesCollection lines = new FileLinesCollection( inputNames[ k ] + Preprocess.TERMS_EXTENSION, "UTF-8" );
			iterators[ k ] = lines.iterator();
		}

		return iterators;
	}

	/** Runs the merge process on the data of a single {@link Preprocess} run.
	 *
	 * <p>This is a convenience method that wraps the basename into a one-element array and
	 * delegates to the array-based variant.
	 * 
	 * @param inputBasename the basename of a previous {@link Preprocess} run.
	 * @param outputBasename the output basename.
	 * @param filters term filters that will be used to choose which term to include in the 
	 * merged data.
	 */
	public static void run( final String inputBasename, final String outputBasename, final TermFilter[] filters ) throws IOException, ConfigurationException, IllegalArgumentException, ClassNotFoundException, IllegalAccessException, InvocationTargetException, InstantiationException, NoSuchMethodException {
		final String[] basenames = { inputBasename };
		run( basenames, outputBasename, filters );
	}
	
	/** Runs the merge process.
	 *
	 * <p>The property file of each input is read to check that all inputs agree on term processor
	 * and field, to sum the number of documents, and to enumerate the batches to be merged. The actual
	 * merge is then delegated to {@link #run(CharSequence[], String, TermFilter[], Properties)}.
	 * 
	 * @param inputBasename the basenames of one or more previous {@link Preprocess} runs.
	 * @param outputBasename the output basename.
	 * @param filters term filters that will be used to choose which term to include in the 
	 * merged data.
	 * @throws IllegalArgumentException if the inputs do not all use the same term processor and the same field.
	 */
	public static void run( final String inputBasename[], final String outputBasename, final TermFilter[] filters ) throws IOException, ConfigurationException, IllegalArgumentException, ClassNotFoundException, IllegalAccessException, InvocationTargetException, InstantiationException, NoSuchMethodException {
		final ArrayList<String> inputNames = new ArrayList<String>();
		TermProcessor firstProcessor = null;
		String firstProcessorSpec = null;
		String firstField = null;
		int documents = 0;
		
		for( int i = 0; i < inputBasename.length; i++ ) { 
			final Properties properties = new Properties( inputBasename[ i ] + Preprocess.PROPERTIES_EXTENSION );
			if ( i == 0 ){
				// The first input defines the term processor and field all other inputs must agree with.
				firstProcessor = ObjectParser.fromSpec( firstProcessorSpec = properties.getString( Preprocess.PropertyKeys.TERMPROCESSOR ), TermProcessor.class, MG4JClassParser.PACKAGE, new String[] { "getInstance" } );
				firstField = properties.getString( Preprocess.PropertyKeys.FIELD );
			}
			else {
				// Compare against the properties of the i-th input (already loaded above), not of a fixed one.
				if ( firstProcessor == null || ! firstProcessor.equals( ObjectParser.fromSpec( properties.getString( Preprocess.PropertyKeys.TERMPROCESSOR ), TermProcessor.class, MG4JClassParser.PACKAGE, new String[] { "getInstance" } ) ) )
					throw new IllegalArgumentException( "Preprocessed data uses inconsistent term processing" );
				if ( firstField == null || ! firstField.equals( properties.getString( Preprocess.PropertyKeys.FIELD ) ) ) 
					throw new IllegalArgumentException( "Preprocessed data uses inconsistent fieldS" );
			}

			documents += properties.getInt( Preprocess.PropertyKeys.DOCUMENTS );
			// Every batch of this input becomes a separate input of the merge (enumerated from the last to the first).
			int numBatches = properties.getInt( Preprocess.PropertyKeys.BATCHES );
			while( numBatches-- != 0 ) inputNames.add( Preprocess.batchName( inputBasename[ i ], numBatches ) );
		}
		
		// Properties of the merged data; the number of terms is added by the merge itself.
		final Properties properties = new Properties();
		properties.setProperty( Preprocess.PropertyKeys.TERMPROCESSOR, firstProcessorSpec );
		properties.setProperty( Preprocess.PropertyKeys.FIELD, firstField );
		properties.setProperty( Preprocess.PropertyKeys.DOCUMENTS, documents );
		properties.setFileName( outputBasename + Preprocess.PROPERTIES_EXTENSION );
		run( inputNames.toArray( new String[ 0 ] ), outputBasename, filters, properties );
	}
	/**
	 * Runs the merge process.
	 *
	 * <p>The term lists of all inputs are merged using a priority queue; frequencies and counts of
	 * equal terms are summed, and a term is written to the output term list (with its summed frequency
	 * and count) only if all filters accept it. The index of every retained term in the complete merged
	 * list is recorded and stored as the embedding list; finally, the properties are saved and a
	 * string map over the retained terms is generated.
	 *
	 * <p>NOTE(review): the merge only ever looks at the top of the queue, so each input term list is
	 * presumably sorted; confirm against {@link Preprocess}.
	 * 
	 * @param inputNames the basenames for all sets of term lists and frequency files to merge.
	 * @param outputBasename the output basename.
	 * @param properties an initialised property object containing additional properties to be
	 * saved (usually, at least {@link Preprocess.PropertyKeys#TERMPROCESSOR} and {@link Preprocess.PropertyKeys#FIELD});
	 * the number of retained terms is added under {@link PropertyKeys#TERMS} before saving. 
	 * @param filters term filters that will be used to choose which term to include in the 
	 * merged data.
	 * 
	 */
	public static void run( final CharSequence[] inputNames, final String outputBasename, final TermFilter[] filters, final Properties properties ) throws IOException, ConfigurationException {

		final OutputBitStream outputFreq = new OutputBitStream( outputBasename + Preprocess.FREQUENCIES_EXTENSION );
		final DataOutputStream outputCounts = new DataOutputStream ( new FastBufferedOutputStream ( new FileOutputStream ( outputBasename + Preprocess.COUNTS_EXTENSION ) ) ) ;
		final PrintWriter outputTerms = new PrintWriter( new OutputStreamWriter( new FileOutputStream( outputBasename + Preprocess.TERMS_EXTENSION ), "UTF-8" ) );

		InputBitStream[] inputFreqs = openFrequencyFiles( inputNames );
		LongIterator[] inputCounts = openCountFiles( inputNames );
		FileLinesCollection.FileLinesIterator[] inputTerms = openTermFiles( inputNames );
		// Parallel arrays holding, for each input, the current term with its frequency and count.
		MutableString[] pqTerms = new MutableString[ inputFreqs.length ];
		int[] pqFrequencies = new int[ inputFreqs.length ];
		long[] pqCounts = new long[ inputFreqs.length ];
		final ProgressLogger pl = new ProgressLogger();

		// The queue contains indices of inputs, ordered by the term currently in pqTerms.
		ObjectHeapSemiIndirectPriorityQueue<MutableString> heap = new ObjectHeapSemiIndirectPriorityQueue<MutableString>( pqTerms );

		for ( int i = 0; i < inputFreqs.length; i++ )
			if ( inputTerms[ i ].hasNext() ) {
				pqTerms[ i ] = inputTerms[ i ].next();
				pqFrequencies[ i ] = inputFreqs[ i ].readGamma();
				pqCounts[ i ] = inputCounts[ i ].nextLong();
				heap.enqueue( i );
			}

		pl.itemsName = "terms";
		pl.start( "Merging and discarding terms.." );

		final int numFilters = filters.length;
		int numTerms = 0, numFilteredTerms = 0, freq = 0, index;
		// Counts are read and written as longs: they must be accumulated in a long to avoid silent truncation.
		long count = 0;
		// Temporary file collecting, for each retained term, its index in the complete merged list.
		File filteredTermsFile = File.createTempFile( MergePreprocessedData.class.getName(), "filtered" );
		filteredTermsFile.deleteOnExit();
		DataOutputStream filteredTerms = new DataOutputStream( new FastBufferedOutputStream( new FileOutputStream( filteredTermsFile ) ) );
		
		while ( !heap.isEmpty() ) {

			// A copy is needed, as the entry at the top of the queue is advanced below.
			final MutableString current = pqTerms[ index = heap.first() ].copy();
			freq = 0;
			count = 0;

			// Recover all in-file and among-file duplicates.
			while ( !heap.isEmpty() && pqTerms[ index = heap.first() ].equals( current ) ) {
				freq += pqFrequencies[ index ];
				count += pqCounts[ index ];

				if ( inputTerms[ index ].hasNext() ) {
					pqTerms[ index ] = inputTerms[ index ].next();
					pqCounts[ index ] = inputCounts[ index ].nextLong();
					pqFrequencies[ index ] = inputFreqs[ index ].readGamma();
					heap.changed();
				}
				else heap.dequeue();
			}

			// The loop leaves j at -1 if and only if no filter rejected the term.
			int j;
			for ( j = numFilters; j-- != 0; ) if ( ! filters[ j ].accept( current, freq ) ) break;

			if ( j == -1 ) {
				outputTerms.println( current );
				outputFreq.writeGamma( freq );
				outputCounts.writeLong ( count );
				filteredTerms.writeInt( numTerms );
				numFilteredTerms++;
			}

			numTerms++;
			pl.lightUpdate();
		}

		pl.done();
		pl.logger.info( "Filtered " + numFilteredTerms + " out of " + pl.count + " (" + Util.format( ( numFilteredTerms * 100.0 ) / pl.count ) + "%)" );

		for ( InputBitStream i : inputFreqs ) i.close();

		filteredTerms.close();
		outputFreq.close();
		outputTerms.close();
		outputCounts.close();
		
		// The embedding list: the (increasing) indices of the retained terms, bounded by the overall number of terms.
		BinIO.storeObject( new EliasFanoMonotoneLongBigList( numFilteredTerms, pl.count, BinIO.asIntIterator( filteredTermsFile ) ), outputBasename + EMBED_EXTENSION );
		filteredTermsFile.delete();

		properties.setProperty( PropertyKeys.TERMS, numFilteredTerms );
		properties.save();

		pl.logger.info( "Generating term map..." );

		// The map is built by rereading the term list that has just been written.
		final FileLinesCollection outputCollection = new FileLinesCollection( outputBasename + Preprocess.TERMS_EXTENSION, "UTF-8" );
		final StringMap<? extends CharSequence> smap = new ShiftAddXorSignedStringMap( outputCollection.iterator(), new LcpMonotoneMinimalPerfectHashFunction<CharSequence>( outputCollection, TransformationStrategies.prefixFreeUtf16() ) );

		BinIO.storeObject( smap, outputBasename + TERMMAP_EXTENSION );
	}

	
	/** Command-line entry point: parses the options, builds the requested filters and runs the merge.
	 *
	 * <p>A filter is created only for the options that were explicitly specified. When a length filter
	 * is requested without a maximum length, no upper bound is enforced.
	 *
	 * @param args the command-line arguments.
	 */
	@SuppressWarnings("unchecked")
	public static void main( String[] args ) throws Exception {
		SimpleJSAP jsap = new SimpleJSAP(
				MergePreprocessedData.class.getName(), "Filters and merges term data (term lists, frequencies, etc.) originated by one or more preprocessing phases and generates the corresponding string map and embedding list.",
				new Parameter[] {
						new Switch( "remove-mixed", 'x', "remove-mixed", "Remove mixed alphanumeric terms." ),
						new FlaggedOption( "stopwords", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "stopwords", "List of lowercase, termprocessed stopwords to be eliminated, one per line." ),
						new FlaggedOption( "min-length", JSAP.INTEGER_PARSER, "1", JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "min-length", "Minimum length of a term." ),
						new FlaggedOption( "max-length", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "max-length", "Maximum length of a term." ),
						new FlaggedOption( "min-freq", JSAP.INTEGER_PARSER, "0", JSAP.NOT_REQUIRED, 'm', "min-freq", "Minimum frequency of a term." ),
						new FlaggedOption( "max-freq", JSAP.INTEGER_PARSER, "-1", JSAP.NOT_REQUIRED, 'M', "max-freq", "Maximum frequency of a term." ),
						new UnflaggedOption( "output", JSAP.STRING_PARSER, JSAP.REQUIRED, "Output basename." ),
						new UnflaggedOption( "input", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.GREEDY, "Input basenames." ) } );

		JSAPResult jsapResult = jsap.parse( args );
		if ( jsap.messagePrinted() ) return;
	
		final ObjectArrayList<TermFilter> filters = new ObjectArrayList<TermFilter>();

		if ( jsapResult.userSpecified( "stopwords" ) ) filters.add( new StopwordFilter( new FileLinesCollection( jsapResult.getString( "stopwords" ), "UTF-8" ).allLines() ) );

		if ( jsapResult.getBoolean( "remove-mixed" ) ) filters.add( new MixedFilter() );

		if ( jsapResult.userSpecified( "min-freq" ) || jsapResult.userSpecified( "max-freq" ) ) filters
				.add( new FrequencyFilter( jsapResult.getInt( "min-freq" ), jsapResult.getInt( "max-freq" ) ) );

		// "max-length" has no default value: fall back to the largest possible bound when only "min-length" is given.
		if ( jsapResult.userSpecified( "max-length" ) || jsapResult.userSpecified("min-length") ) filters.add( new LengthFilter( jsapResult.getInt( "min-length" ), jsapResult.getInt( "max-length", Integer.MAX_VALUE ) ) );

		run( jsapResult.getStringArray( "input" ), jsapResult.getString( "output" ), filters.toArray( new TermFilter[ 0 ] ) );
	}
}