All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.di.mg4j.document.DispatchingDocumentFactory Maven / Gradle / Ivy

Go to download

MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java.

There is a newer version: 5.2.2
Show newest version
package it.unimi.di.mg4j.document;

/*		 
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2005-2012 Paolo Boldi 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectLinkedOpenHashMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.NullReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.util.Properties;

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Iterator;
import java.util.Map;

import org.apache.commons.configuration.ConfigurationException;

/** A document factory that actually dispatches the task of building documents to various factories
 *  according to some strategy.
 * 
 * 

The strategy is specified as (an object embedding) a method that determines which factory * should be used on the basis of the metadata that are provided to the {@link #getDocument(InputStream, Reference2ObjectMap)} * method. Since usually the strategy will have to resolve the name of metadata, it is also passed * this factory, so that the correct * {@link it.unimi.di.mg4j.document.PropertyBasedDocumentFactory#resolve(Enum,Reference2ObjectMap)} method can be invoked. * *

Moreover, at construction one must specify, for each subfactory and for each field of this * factory, which field of the subfactory should be used. Note that to guarantee sequential access, * fields specified for each subfactory should appear in increasing order. */ public class DispatchingDocumentFactory extends PropertyBasedDocumentFactory { private static final long serialVersionUID = 1L; private static final boolean DEBUG = false; /** Case-insensitive keys for metadata. * * @see PropertyBasedDocumentFactory.MetadataKeys */ public static enum MetadataKeys { /** The property containing the (comma-separated) sequence of field names. */ FIELDNAME, /** The property containing the key that should be checked (e.g., mimetype). */ KEY, /** The property containing comma-separated sequence of colon-separated pairs value/document factory names. */ RULE, /** The property containing a comma-separated list with as many items as there are factories; each item will be * a colon-separated list of as many integers as there are fields. The k-th integer in the f-th * list is the number of the field of the f-th factory that should be used to extract field number k, * or -1 if the field should be empty. */ MAP } /** The value to be used in RULE to introduce the default factory. Otherwise, no default factory is * provided for documents that do not match. */ public final static String OTHERWISE_IN_RULE = "?"; /** A strategy that decides which factory is appropriate using the document metadata. */ public static interface DispatchingStrategy extends Serializable { /** Decides the index of the factory to be used for the given metadata, possibly using * a factory to resolve property names. * * @param metadata the metadata of the document to be produced. * @param factory the factory used to resolve metadata names. * @return the factory index. 
*/ public int factoryNumber( Reference2ObjectMap,Object> metadata, PropertyBasedDocumentFactory factory ); }; /** A strategy that is based on trying to match the value of the metadata with a given key with respect to a * certain set of values. */ public static class StringBasedDispatchingStrategy implements DispatchingStrategy { private static final long serialVersionUID = 1L; /** The key to be resolved. */ private final Enum key; /** The values that should be used for comparisons. */ private final Object2IntMap value; /** The strategy works as follows: the property named key is resolved; if this property * is not set, the default return value of value is returned. * Otherwise, its value is compared, using the equals, * method with the elements of the value set, and the corresponding integer is returned. * * @param key the key to be resolved. * @param value the map of values. */ public StringBasedDispatchingStrategy( final Enum key, final Object2IntMap value ) { this.key = key; this.value = value; } public int factoryNumber( final Reference2ObjectMap,Object> metadata, final PropertyBasedDocumentFactory factory ) { final Object val = factory.resolve( key, metadata ); if ( DEBUG ) System.out.println( "key " + key + " resolved using " + metadata + " into " + val ); return value.getInt( val ); } }; /** The number of subfactories used. */ private int n; /** The subfactories used. */ private DocumentFactory[] documentFactory; /** The number of fields of this factory. */ private int numberOfFields; /** The names of the fields. */ private String[] fieldName; /** The types of the fields. */ private FieldType[] fieldType; /** The array specifying how subfactory fields should be mapped into fields of this factory. 
More precisely, * rename[f][k] specifies which field of factory documentFactory[f] should be used * to return the field named fieldName[k]: it is assumed that the type of the field in the subfactory * is correct (i.e., that documentFactory[f].fieldType(k)==fieldType[k]). The value -1 is used to * return an empty textual field (i.e., a word reader on an empty string). */ private int[][] rename; /** The strategy to be used. */ private DispatchingStrategy strategy; /** If a {@link StringBasedDispatchingStrategy} should be used, this field represents the property key to be checked. * Otherwise, this is null. */ private Enum dispatchingKey; /** If a {@link StringBasedDispatchingStrategy} should be used, this field represents the map from values to factories. */ private Object2ObjectLinkedOpenHashMap> value2factoryClass; private void init( final DocumentFactory[] documentFactory, final String[] fieldName, final FieldType[] fieldType, final int[][] rename, final DispatchingStrategy strategy ) { n = documentFactory.length; this.documentFactory = documentFactory; numberOfFields = fieldName.length; this.fieldName = fieldName; this.fieldType = fieldType; this.rename = rename; this.strategy = strategy; } // TODO: All IllegalArgumentException where ConfigurationException; check that now it's OK private void checkAttributes() { if ( fieldName.length != fieldType.length || rename.length != documentFactory.length || documentFactory.length != n || fieldName.length != numberOfFields ) throw new IllegalArgumentException( "Length mismatch in defining the dispatching factory"); for ( int f = 0; f < n; f++ ) { if ( rename[ f ].length != numberOfFields ) throw new IllegalArgumentException( "The number of fields (" + numberOfFields + ") does not match the mapping rule for factory " + documentFactory[ f ].getClass().getName() ); for ( int k = 0; k < numberOfFields; k++ ) { if ( rename[ f ][ k ] < -1 || rename[ f ][ k ] >= documentFactory[ f ].numberOfFields() ) throw new 
IllegalArgumentException( rename[ f ][ k ] + " is not a field of factory " + documentFactory[ f ] ); if ( rename[ f ][ k ] >= 0 && fieldType[ k ] != documentFactory[ f ].fieldType( rename[ f ][ k ] ) ) throw new IllegalArgumentException( "Field " + rename[ f ][ k ] + " of factory " + documentFactory[ f ] + " has a type different from the type of the field it is mapped to" ); } } if ( n == 0 || numberOfFields == 0 ) throw new IllegalArgumentException( "Zero factories or fields specified" ); if ( strategy == null ) throw new IllegalArgumentException( "No strategy was specified" ); } private void setExtraArguments( final Object xtraPars ) throws IllegalArgumentException { if ( value2factoryClass == null ) throw new IllegalArgumentException( "No " + MetadataKeys.RULE + " property was specified for the dispatching factory" ); n = value2factoryClass.values().size(); documentFactory = new DocumentFactory[ n ]; Iterator> it = value2factoryClass.values().iterator(); for ( int f = 0; f < n; f++ ) { Class documentFactoryClass = it.next(); try { if ( xtraPars == null ) documentFactory[ f ] = documentFactoryClass.newInstance(); else documentFactory[ f ] = documentFactoryClass.getConstructor( xtraPars.getClass() ).newInstance( xtraPars ); } catch ( Exception e ) { throw new IllegalArgumentException( e ); } } fieldType = new FieldType[ numberOfFields ]; if ( rename == null ) throw new IllegalArgumentException( "No " + MetadataKeys.MAP + " property was specified for the dispatching factory" ); for ( int f = 0; f < n; f++ ) { for ( int k = 0; k < numberOfFields; k++ ) { int kk = rename[ f ][ k ]; if ( kk >= 0 && fieldType[ k ] != null && fieldType[ k ] != documentFactory[ f ].fieldType( kk ) ) throw new IllegalArgumentException( "Mismatch between field types for field " + f + ", relative to the remapping of factory " + documentFactory[ f ].getClass().getName() + " (the type used to be " + fieldType[ k ] + ", but now we want it to be " + documentFactory[ f ].fieldType( kk ) + ")" ); 
if ( kk >= 0 ) fieldType[ k ] = documentFactory[ f ].fieldType( kk ); } } for ( int f = 0; f < numberOfFields; f++ ) if ( fieldType[ f ] == null ) throw new IllegalArgumentException( "The type of field " + fieldName[ f ] + " could not be deduced, because it is never mapped to" ); if ( dispatchingKey == null ) throw new IllegalArgumentException( "No " + MetadataKeys.KEY + " property was specified for the dispatching factory" ); Object2IntMap value2int = new Object2IntOpenHashMap(); value2int.defaultReturnValue( -1 ); for( Map.Entry> e : value2factoryClass.entrySet() ) { int k; for ( k = 0; k < n; k++ ) if ( e.getValue() == documentFactory[ k ].getClass() ) { if ( e.getKey().equals( OTHERWISE_IN_RULE ) ) value2int.defaultReturnValue( k ); else value2int.put( e.getKey(), k ); break; } if ( k == n ) throw new IllegalArgumentException( "Mismatch in the rule mapping " + e.getKey() + " to " + e.getValue() ); } System.out.println( "Building a strategy mapping " + dispatchingKey + " to " + value2int ); strategy = new StringBasedDispatchingStrategy( dispatchingKey, value2int ); } /** Creates a new dispatching factory. * * @param documentFactory the array of subfactories. * @param fieldName the names of this factory's fields. * @param fieldType the types of this factory's fields. * @param rename the way fields of this class are mapped to fields of the subfactories. * @param strategy the strategy to decide which factory should be used. 
*/ public DispatchingDocumentFactory( final DocumentFactory[] documentFactory, final String[] fieldName, final FieldType[] fieldType, final int[][] rename, final DispatchingStrategy strategy ){ init( documentFactory, fieldName, fieldType, rename, strategy ); checkAttributes(); } public DispatchingDocumentFactory copy() { final DocumentFactory[] documentFactory = new DocumentFactory[ this.documentFactory.length ]; for( int i = documentFactory.length; i-- != 0; ) documentFactory[ i ] = this.documentFactory[ i ].copy(); return new DispatchingDocumentFactory( documentFactory, fieldName, fieldType, rename, strategy ); } public DispatchingDocumentFactory( final Properties properties ) throws ConfigurationException { super( properties ); setExtraArguments( properties ); checkAttributes(); } public DispatchingDocumentFactory( final String[] property ) throws ConfigurationException { super( property ); setExtraArguments( property ); checkAttributes(); } public DispatchingDocumentFactory( final Reference2ObjectMap,Object> defaultMetadata ) { super( defaultMetadata ); checkAttributes(); // Will certainly fail because the configuration is actually missing } public DispatchingDocumentFactory() { super(); checkAttributes(); // Will certainly fail because the configuration is actually missing } @SuppressWarnings({ "unchecked", "rawtypes" }) @Override protected boolean parseProperty( final String key, final String[] values, final Reference2ObjectMap,Object> metadata ) throws ConfigurationException { if ( sameKey( MetadataKeys.FIELDNAME, key ) ) { fieldName = values; numberOfFields = fieldName.length; return true; } else if ( sameKey( MetadataKeys.KEY, key ) ) { final String dispatchingKeyName = ensureJustOne( key, values ); final int lastDot = dispatchingKeyName.lastIndexOf( '.' 
); try { dispatchingKey = Enum.valueOf( (Class)Class.forName( dispatchingKeyName.substring( 0, lastDot ) ), dispatchingKeyName.substring( lastDot + 1) ); } catch ( ClassNotFoundException e ) { throw new IllegalArgumentException( "The class specified in the key " + dispatchingKeyName + " cannot be found" ); } return true; } else if ( sameKey( MetadataKeys.RULE, key ) ) { String[] rules = values; value2factoryClass = new Object2ObjectLinkedOpenHashMap>(); int i, m = rules.length; for ( i = 0; i < m; i++ ) { int pos = rules[ i ].indexOf( ':' ); if ( pos <= 0 || pos == rules[ i ].length() - 1 ) throw new ConfigurationException( "Rule " + rules[ i ] + " does not contain a colon or it is malformed" ); if ( rules[ i ].indexOf( ':', pos + 1 ) >= 0 ) throw new ConfigurationException( "Rule " + rules[ i ] + " contains too many colons" ); String factoryName = rules[ i ].substring( pos + 1 ); Class factoryClass = null; try { factoryClass = (Class)Class.forName( factoryName ); if ( ! ( DocumentFactory.class.isAssignableFrom( factoryClass ) ) ) throw new ClassNotFoundException(); } catch ( ClassNotFoundException e ) { throw new ConfigurationException( "ParsingFactory " + factoryName + " is invalid; maybe the package name is missing" ); } value2factoryClass.put( rules[ i ].substring( 0, pos ), factoryClass ); } m = value2factoryClass.values().size(); return true; } else if ( sameKey( MetadataKeys.MAP, key ) ) { String[] pieces = values; int i, m = pieces.length; rename = new int[ m ][]; for ( i = 0; i < m; i++ ) { String[] subpieces = pieces[ i ].split( ":" ); if ( i > 0 && subpieces.length != rename[ 0 ].length ) throw new ConfigurationException( "Length mismatch in the map " + values ); rename[ i ] = new int[ subpieces.length ]; for ( int k = 0; k < subpieces.length; k++ ) { try { rename[ i ][ k ] = Integer.parseInt( subpieces[ k ] ); } catch ( NumberFormatException e ) { throw new ConfigurationException( "Number format exception in the map " + values ); } } } } return 
super.parseProperty( key, values, metadata ); } public int numberOfFields() { return numberOfFields; } public String fieldName( final int field ) { ensureFieldIndex( field ); return fieldName[ field ]; } public int fieldIndex( final String fieldName ) { for ( int k = 0; k < numberOfFields; k++ ) if ( this.fieldName[ k ].equals( fieldName ) ) return k; return -1; } public FieldType fieldType( final int field ) { ensureFieldIndex( field ); return fieldType[ field ]; } /** A word reader that is returned when a null field should be returned. */ final private WordReader nullReader = new FastBufferedReader(); public Document getDocument( final InputStream rawContent, final Reference2ObjectMap,Object> metadata ) throws IOException { final int factoryIndex = strategy.factoryNumber( metadata, this ); System.out.println( "The strategy returned " + factoryIndex ); if ( factoryIndex < 0 || factoryIndex >= n ) throw new IllegalArgumentException(); System.out.println( "Going to parse a document with " + metadata + ", using " + documentFactory[ factoryIndex ].getClass().getName() ); final DocumentFactory factory = documentFactory[ factoryIndex ]; final Document document = factory.getDocument( rawContent, metadata ); return new AbstractDocument() { public CharSequence title() { return document.title(); } public String toString() { return document.toString(); } public CharSequence uri() { return document.uri(); } public Object content( final int field ) throws IOException { ensureFieldIndex( field ); if ( rename[ factoryIndex ][ field ] < 0 ) return NullReader.getInstance(); return document.content( rename[ factoryIndex ][ field ] ); } public WordReader wordReader( final int field ) { ensureFieldIndex( field ); if ( rename[ factoryIndex ][ field ] < 0 ) return nullReader; return document.wordReader( rename[ factoryIndex ][ field ] ); } public void close() throws IOException { super.close(); document.close(); } }; } public static void main( final String[] arg ) throws IOException, 
ConfigurationException { //PdfDocumentFactory pdfFactory = new PdfDocumentFactory(); //HtmlDocumentFactory htmlFactory = new HtmlDocumentFactory(); //IdentityDocumentFactory idFactory = new IdentityDocumentFactory(); //Object2IntMap map = new Object2IntOpenHashMap( // new String[] { "application/pdf", "text/html" }, // new int[] { 0, 1 } // ); //map.defaultReturnValue( 2 ); //DispatchingStrategy strategy = new StringBasedDispatchingStrategy( MetadataKeys.MIMETYPE, map ); Properties p = new Properties(); p.addProperty( MetadataKeys.FIELDNAME.name().toLowerCase(), "text,title" ); p.addProperty( MetadataKeys.KEY.name().toLowerCase(), PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE.name() ); p.addProperty( MetadataKeys.RULE.name().toLowerCase(), "application/pdf:it.unimi.di.mg4j.document.PdfDocumentFactory,text/html:it.unimi.di.mg4j.document.HtmlDocumentFactory,?:it.unimi.di.mg4j.document.IdentityDocumentFactory" ); p.addProperty( MetadataKeys.MAP.name().toLowerCase(), "0:-1,0:1,0:-1" ); p.addProperty( MetadataKeys.MAP.name().toLowerCase(), "0:-1,0:1,0:-1" ); p.addProperty( MetadataKeys.MAP.name().toLowerCase(), "0:-1,0:1,0:-1" ); p.addProperty( PropertyBasedDocumentFactory.MetadataKeys.ENCODING.name().toLowerCase(), "iso-8859-1" ); DispatchingDocumentFactory factory = new DispatchingDocumentFactory( p ); DocumentCollection dc = new FileSetDocumentCollection( arg, factory ); BinIO.storeObject( dc, "test.collection" ); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy