All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.di.archive4j.tool.MergePreprocessedData Maven / Gradle / Ivy

Go to download

Archive4J is a suite of tools to store compactly term/count information of a document collection.

There is a newer version: 1.3.3
Show newest version
package it.unimi.di.archive4j.tool;

/*
 * Copyright (C) 2008-2013 Alessio Orlandi and Sebastiano Vigna
 *
 *  This program is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */
import it.unimi.di.big.mg4j.index.TermProcessor;
import it.unimi.di.big.mg4j.util.MG4JClassParser;
import it.unimi.dsi.Util;
import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.LongIterator;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction;
import it.unimi.dsi.sux4j.util.EliasFanoMonotoneLongBigList;
import it.unimi.dsi.util.Properties;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.util.StringMap;

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

import org.apache.commons.configuration.ConfigurationException;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;


/** Filters and merges term data (term lists, frequencies, global counts) originated by one or more {@linkplain Preprocess preprocessing phases}
 * and generates the corresponding {@link StringMap}.
 * 
 * 

After {@link Preprocess} has performed the first pass over a collection, this class {@linkplain TermFilter filters} the collected terms * and merges the resulting data. Additionally, it generates the following files: * *

* *
basename.termmap * *
For each filtered term, its position in the term list. * *
basename.embed * *
The embedding list for this archive. It maps every term to * its position in the non-filtered term list. It is useful to map * terms of the archive in terms of an index over the same archive. * *
* *

Strategies to remove terms are provided by {@link TermFilter} implementations. * * @author Alessio Orlandi * @author Sebastiano Vigna */ public class MergePreprocessedData { /** The extension of the map for (filtered) terms. */ public static final String TERMMAP_EXTENSION = ".termmap"; /** The extension of the embedding list. */ public static final String EMBED_EXTENSION = ".embed"; /** * Configuration keys that are used also by {@link Scan}. */ public static enum PropertyKeys { /** The number of terms. */ TERMS } /** * Interface used to specify whether a term must be filtered (that is, eliminated from the archive) or not. * * @author Alessio Orlandi */ public static interface TermFilter { /** Returns true if the given term must be accepted. * * @param term the term. * @param frequency the frequency of term. */ public boolean accept( final MutableString term, final int frequency ); } /** A filter that eliminates mixed digit-nondigit terms. */ public static class MixedFilter implements TermFilter { public boolean accept( final MutableString word, final int frequency ) { final char[] wordChars = word.array(); boolean hasDigits = false, hasNonDigits = false, keepLooping = true; for ( int i = word.length(); keepLooping && i-- != 0; ) { if ( Character.isDigit( wordChars[ i ] ) ) hasDigits = true; else hasNonDigits = true; keepLooping = hasDigits ^ hasNonDigits; } return keepLooping; } } /** A filter that eliminates too long or too short terms. */ public static class LengthFilter implements TermFilter { final int minLen, maxLen; /** Creates a new length filter. * * @param minLen the minimum length of an accepted term (inclusive). * @param maxLen the maximum length of an accepted term (exclusive). */ public LengthFilter( final int minLen, final int maxLen ) { this.minLen = minLen; this.maxLen = maxLen; } public boolean accept( final MutableString word, final int frequency ) { final int wlen = word.length(); return wlen >= minLen && wlen < maxLen; } } /** Filter that eliminates terms in a given set. */ public static class StopwordFilter implements TermFilter { private final ObjectOpenHashSet terms; /** Creates a new filter using a given iterable object of terms. * * @param terms an iterable object; the returned terms will be copied into * {@link MutableString} instances. */ public StopwordFilter( final Iterable terms ) { this.terms = new ObjectOpenHashSet( Hash.DEFAULT_INITIAL_SIZE, Hash.VERY_FAST_LOAD_FACTOR ); for( CharSequence s : terms ) this.terms.add( new MutableString( s ) ); this.terms.trim(); } public boolean accept( final MutableString word, final int frequency ) { return ! terms.contains( word ); } } /** Filter that eliminates terms that are too much or not enough frequent. */ public static class FrequencyFilter implements TermFilter { final int minFreq, maxFreq; /** Creates a new frequency filter. * * @param minFreq the minimum frequency of an accepted term (inclusive). * @param maxFreq the maximum frequency of an accepted term (exclusive). */ public FrequencyFilter( final int minFreq, final int maxFreq ) { this.minFreq = minFreq; this.maxFreq = maxFreq; } public boolean accept( final MutableString term, final int frequency ) { return frequency >= minFreq && (frequency < maxFreq || maxFreq == -1 ) ; } } private static InputBitStream[] openFrequencyFiles( final CharSequence[] inputNames ) throws IOException { final InputBitStream[] streams = new InputBitStream[ inputNames.length ]; for ( int i = 0; i < inputNames.length; i++ ) streams[ i ] = new InputBitStream( inputNames[ i ] + Preprocess.FREQUENCIES_EXTENSION ); return streams; } private static LongIterator[] openCountFiles( final CharSequence[] inputNames ) throws IOException { final LongIterator[] iterators = new LongIterator[ inputNames.length ]; for ( int i = 0; i < inputNames.length; i++ ) iterators[ i ] = BinIO.asLongIterator( inputNames[ i ] + Preprocess.COUNTS_EXTENSION ); return iterators; } private static FileLinesCollection.FileLinesIterator[] openTermFiles( final CharSequence[] inputNames ) { final FileLinesCollection.FileLinesIterator[] files = new FileLinesCollection.FileLinesIterator[ inputNames.length ]; for ( int i = 0; i < inputNames.length; i++ ) files[ i ] = new FileLinesCollection( inputNames[ i ] + Preprocess.TERMS_EXTENSION, "UTF-8" ).iterator(); return files; } /** Runs the merge process. * * @param inputBasename the basename of a previous {@link Preprocess} runs. * @param outputBasename the output basename. * @param filters term filters that will be used to choose which term to include in the * merged data. */ public static void run( final String inputBasename, final String outputBasename, final TermFilter[] filters ) throws IOException, ConfigurationException, IllegalArgumentException, ClassNotFoundException, IllegalAccessException, InvocationTargetException, InstantiationException, NoSuchMethodException { run( new String[] { inputBasename }, outputBasename, filters ); } /** Runs the merge process. * * @param inputBasename the basenames of one or more previous {@link Preprocess} runs. * @param outputBasename the output basename. * @param filters term filters that will be used to choose which term to include in the * merged data. */ public static void run( final String inputBasename[], final String outputBasename, final TermFilter[] filters ) throws IOException, ConfigurationException, IllegalArgumentException, ClassNotFoundException, IllegalAccessException, InvocationTargetException, InstantiationException, NoSuchMethodException { ArrayList inputNames = new ArrayList(); TermProcessor firstProcessor = null; String firstProcessorSpec = null; String firstField = null; int documents = 0; for( int i = 0; i < inputBasename.length; i++ ) { Properties properties = new Properties( inputBasename[ i ] + Preprocess.PROPERTIES_EXTENSION ); if ( i == 0 ){ firstProcessor = ObjectParser.fromSpec( firstProcessorSpec = properties.getString( Preprocess.PropertyKeys.TERMPROCESSOR ), TermProcessor.class, MG4JClassParser.PACKAGE, new String[] { "getInstance" } ); firstField = properties.getString( Preprocess.PropertyKeys.FIELD ); } else { if ( firstProcessor == null || ! firstProcessor.equals( ObjectParser.fromSpec( new Properties( inputBasename[ 1 ] + Preprocess.PROPERTIES_EXTENSION ).getString( Preprocess.PropertyKeys.TERMPROCESSOR ), TermProcessor.class, MG4JClassParser.PACKAGE, new String[] { "getInstance" } ) ) ) throw new IllegalArgumentException( "Preprocessed data uses inconsistent term processing" ); if ( firstField == null || ! firstField.equals( properties.getString( Preprocess.PropertyKeys.FIELD ) ) ) throw new IllegalArgumentException( "Preprocessed data uses inconsistent fieldS" ); } documents += properties.getInt( Preprocess.PropertyKeys.DOCUMENTS ); int numBatches = properties.getInt( Preprocess.PropertyKeys.BATCHES ); while( numBatches-- != 0 ) inputNames.add( Preprocess.batchName( inputBasename[ i ], numBatches ) ); } Properties properties = new Properties(); properties.setProperty( Preprocess.PropertyKeys.TERMPROCESSOR, firstProcessorSpec ); properties.setProperty( Preprocess.PropertyKeys.FIELD, firstField ); properties.setProperty( Preprocess.PropertyKeys.DOCUMENTS, documents ); properties.setFileName( outputBasename + Preprocess.PROPERTIES_EXTENSION ); run( inputNames.toArray( new String[ 0 ] ), outputBasename, filters, properties ); } /** * Runs the merge process. * * @param inputNames the basenames for all sets of term lists and frequency files to merge. * @param outputBasename the output basename. * @param properties an initialised property object containing additional properties to be * saved (usually, at least {@link Preprocess.PropertyKeys#TERMPROCESSOR} and {@link Preprocess.PropertyKeys#FIELD}). * @param filters term filters that will be used to choose which term to include in the * merged data. * */ public static void run( final CharSequence[] inputNames, final String outputBasename, final TermFilter[] filters, final Properties properties ) throws IOException, ConfigurationException { final OutputBitStream outputFreq = new OutputBitStream( outputBasename + Preprocess.FREQUENCIES_EXTENSION ); final DataOutputStream outputCounts = new DataOutputStream ( new FastBufferedOutputStream ( new FileOutputStream ( outputBasename + Preprocess.COUNTS_EXTENSION ) ) ) ; final PrintWriter outputTerms = new PrintWriter( new OutputStreamWriter( new FileOutputStream( outputBasename + Preprocess.TERMS_EXTENSION ), "UTF-8" ) ); InputBitStream[] inputFreqs = openFrequencyFiles( inputNames ); LongIterator[] inputCounts = openCountFiles( inputNames ); FileLinesCollection.FileLinesIterator[] inputTerms = openTermFiles( inputNames ); MutableString[] pqTerms = new MutableString[ inputFreqs.length ]; int[] pqFrequencies = new int[ inputFreqs.length ]; long[] pqCounts = new long[ inputFreqs.length ]; final ProgressLogger pl = new ProgressLogger(); ObjectHeapSemiIndirectPriorityQueue heap = new ObjectHeapSemiIndirectPriorityQueue( pqTerms ); for ( int i = 0; i < inputFreqs.length; i++ ) if ( inputTerms[ i ].hasNext() ) { pqTerms[ i ] = inputTerms[ i ].next(); pqFrequencies[ i ] = inputFreqs[ i ].readGamma(); pqCounts[ i ] = inputCounts[ i ].nextLong(); heap.enqueue( i ); } pl.itemsName = "terms"; pl.start( "Merging and discarding terms.." ); final int numFilters = filters.length; int numTerms = 0, numFilteredTerms = 0, freq = 0, count = 0, index; File filteredTermsFile = File.createTempFile( MergePreprocessedData.class.getName(), "filtered" ); filteredTermsFile.deleteOnExit(); DataOutputStream filteredTerms = new DataOutputStream( new FastBufferedOutputStream( new FileOutputStream( filteredTermsFile ) ) ); while ( !heap.isEmpty() ) { final MutableString current = pqTerms[ index = heap.first() ].copy(); freq = 0; count = 0; // Recover all in-file and among-file duplicates. while ( !heap.isEmpty() && pqTerms[ index = heap.first() ].equals( current ) ) { freq += pqFrequencies[ index ]; count += pqCounts[ index ]; if ( inputTerms[ index ].hasNext() ) { pqTerms[ index ] = inputTerms[ index ].next(); pqCounts[ index ] = inputCounts[ index ].nextLong(); pqFrequencies[ index ] = inputFreqs[ index ].readGamma(); heap.changed(); } else heap.dequeue(); } int j; for ( j = numFilters; j-- != 0; ) if ( ! filters[ j ].accept( current, freq ) ) break; if ( j == -1 ) { outputTerms.println( current ); outputFreq.writeGamma( freq ); outputCounts.writeLong ( count ); filteredTerms.writeInt( numTerms ); numFilteredTerms++; } numTerms++; pl.lightUpdate(); } pl.done(); pl.logger.info( "Filtered " + numFilteredTerms + " out of " + pl.count + " (" + Util.format( ( numFilteredTerms * 100.0 ) / pl.count ) + "%)" ); for ( InputBitStream i : inputFreqs ) i.close(); filteredTerms.close(); outputFreq.close(); outputTerms.close(); outputCounts.close(); BinIO.storeObject( new EliasFanoMonotoneLongBigList( numFilteredTerms, pl.count, BinIO.asIntIterator( filteredTermsFile ) ), outputBasename + EMBED_EXTENSION ); filteredTermsFile.delete(); properties.setProperty( PropertyKeys.TERMS, numFilteredTerms ); properties.save(); pl.logger.info( "Generating term map..." ); final FileLinesCollection outputCollection = new FileLinesCollection( outputBasename + Preprocess.TERMS_EXTENSION, "UTF-8" ); final StringMap smap = new ShiftAddXorSignedStringMap( outputCollection.iterator(), new LcpMonotoneMinimalPerfectHashFunction( outputCollection, TransformationStrategies.prefixFreeUtf16() ) ); BinIO.storeObject( smap, outputBasename + TERMMAP_EXTENSION ); } @SuppressWarnings("unchecked") public static void main( String[] args ) throws Exception { SimpleJSAP jsap = new SimpleJSAP( MergePreprocessedData.class.getName(), "Filters and merges term data (term lists, frequencies, etc.) originated by one or more preprocessing phases and generates the corresponding string map and embedding list.", new Parameter[] { new Switch( "remove-mixed", 'x', "remove-mixed", "Remove mixed alphanumeric terms." ), new FlaggedOption( "stopwords", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "stopwords", "List of lowercase, termprocessed stopwords to be eliminated, one per line." ), new FlaggedOption( "min-length", JSAP.INTEGER_PARSER, "1", JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "min-length", "Minimum length of a term." ), new FlaggedOption( "max-length", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "max-length", "Maximum length of a term." ), new FlaggedOption( "min-freq", JSAP.INTEGER_PARSER, "0", JSAP.NOT_REQUIRED, 'm', "min-freq", "Minimum frequency of a term." ), new FlaggedOption( "max-freq", JSAP.INTEGER_PARSER, "-1", JSAP.NOT_REQUIRED, 'M', "max-freq", "Maximum frequency of a term." ), new UnflaggedOption( "output", JSAP.STRING_PARSER, JSAP.REQUIRED, "Output basename." ), new UnflaggedOption( "input", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.GREEDY, "Input basenames." ) } ); JSAPResult jsapResult = jsap.parse( args ); if ( jsap.messagePrinted() ) return; ObjectArrayList filters = new ObjectArrayList(); if ( jsapResult.userSpecified( "stopwords" ) ) filters.add( new StopwordFilter( new FileLinesCollection( jsapResult.getString( "stopwords" ), "UTF-8" ).allLines() ) ); if ( jsapResult.getBoolean( "remove-mixed" ) ) filters.add( new MixedFilter() ); if ( jsapResult.userSpecified( "min-freq" ) || jsapResult.userSpecified( "max-freq" ) ) filters .add( new FrequencyFilter( jsapResult.getInt( "min-freq" ), jsapResult.getInt( "max-freq" ) ) ); if ( jsapResult.userSpecified( "max-length" ) || jsapResult.userSpecified("min-length") ) filters.add( new LengthFilter( jsapResult.getInt( "min-length" ), jsapResult.getInt( "max-length" ) ) ); run( jsapResult.getStringArray( "input" ), jsapResult.getString( "output" ), filters.toArray( new TermFilter[ 0 ] ) ); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy