// src.it.unimi.di.mg4j.document.DispatchingDocumentFactory (mg4j artifact; Maven / Gradle / Ivy)
// [web-page navigation residue: "Show all versions of mg4j", "Show documentation"]
package it.unimi.di.mg4j.document;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2005-2012 Paolo Boldi
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectLinkedOpenHashMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.NullReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.util.Properties;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Iterator;
import java.util.Map;
import org.apache.commons.configuration.ConfigurationException;
/** A document factory that actually dispatches the task of building documents to various factories
* according to some strategy.
*
* <p>The strategy is specified as (an object embedding) a method that determines which factory
* should be used on the basis of the metadata that are provided to the {@link #getDocument(InputStream, Reference2ObjectMap)}
* method. Since the strategy will usually have to resolve the names of metadata, it is also passed
* this factory, so that the correct
* {@link it.unimi.di.mg4j.document.PropertyBasedDocumentFactory#resolve(Enum,Reference2ObjectMap)} method can be invoked.
*
* <p>Moreover, at construction one must specify, for each subfactory and for each field of this
* factory, which field of the subfactory should be used. Note that to guarantee sequential access,
* fields specified for each subfactory should appear in increasing order.
*/
public class DispatchingDocumentFactory extends PropertyBasedDocumentFactory {
/** Serialization version identifier of this class. */
private static final long serialVersionUID = 1L;
/** Compile-time switch that enables diagnostic output on standard output (see {@code StringBasedDispatchingStrategy.factoryNumber()}). */
private static final boolean DEBUG = false;
/** Case-insensitive keys for metadata.
*
* <p>Each constant names one configuration property that <code>parseProperty()</code>
* of the enclosing factory recognizes.
*
* @see PropertyBasedDocumentFactory.MetadataKeys
*/
public static enum MetadataKeys {
/** The property containing the (comma-separated) sequence of field names. */
FIELDNAME,
/** The property containing the key that should be checked (e.g., mimetype). */
KEY,
/** The property containing comma-separated sequence of colon-separated pairs value/document factory names. */
RULE,
/** The property containing a comma-separated list with as many items as there are factories; each item will be
* a colon-separated list of as many integers as there are fields. The k-th integer in the f-th
* list is the number of the field of the f-th factory that should be used to extract field number k,
* or -1 if the field should be empty. */
MAP
}
/** The value to be used in <code>RULE</code> to introduce the default factory. Otherwise, no default factory is
* provided for documents that do not match. */
public final static String OTHERWISE_IN_RULE = "?";
/** A strategy that decides which factory is appropriate using the document metadata. */
public static interface DispatchingStrategy extends Serializable {
/** Decides the index of the factory to be used for the given metadata, possibly using
* a factory to resolve property names.
*
* @param metadata the metadata of the document to be produced.
* @param factory the factory used to resolve metadata names.
* @return the factory index.
*/
public int factoryNumber( Reference2ObjectMap,Object> metadata, PropertyBasedDocumentFactory factory );
};
/** A strategy that is based on trying to match the value of the metadata with a given key with respect to a
* certain set of values.
*/
public static class StringBasedDispatchingStrategy implements DispatchingStrategy {
private static final long serialVersionUID = 1L;
/** The key to be resolved. */
private final Enum> key;
/** The values that should be used for comparisons. */
private final Object2IntMap value;
/** The strategy works as follows: the property named key
is resolved; if this property
* is not set, the default return value of value is returned.
* Otherwise, its value is compared, using the equals
,
* method with the elements of the value
set, and the corresponding integer is returned.
*
* @param key the key to be resolved.
* @param value the map of values.
*/
public StringBasedDispatchingStrategy( final Enum> key, final Object2IntMap value ) {
this.key = key;
this.value = value;
}
public int factoryNumber( final Reference2ObjectMap,Object> metadata, final PropertyBasedDocumentFactory factory ) {
final Object val = factory.resolve( key, metadata );
if ( DEBUG ) System.out.println( "key " + key + " resolved using " + metadata + " into " + val );
return value.getInt( val );
}
};
/** The number of subfactories used. */
private int n;
/** The subfactories used. */
private DocumentFactory[] documentFactory;
/** The number of fields of this factory. */
private int numberOfFields;
/** The names of the fields. */
private String[] fieldName;
/** The types of the fields. */
private FieldType[] fieldType;
/** The array specifying how subfactory fields should be mapped into fields of this factory. More precisely,
* rename[f][k]
specifies which field of factory documentFactory[f]
should be used
* to return the field named fieldName[k]
: it is assumed that the type of the field in the subfactory
* is correct (i.e., that documentFactory[f].fieldType(k)==fieldType[k]
). The value -1 is used to
* return an empty textual field (i.e., a word reader on an empty string).
*/
private int[][] rename;
/** The strategy to be used. */
private DispatchingStrategy strategy;
/** If a {@link StringBasedDispatchingStrategy} should be used, this field represents the property key to be checked.
* Otherwise, this is null
. */
private Enum> dispatchingKey;
/** If a {@link StringBasedDispatchingStrategy} should be used, this field represents the map from values to factories. */
private Object2ObjectLinkedOpenHashMap> value2factoryClass;
/** Records the given configuration in the fields of this factory; no validation is performed here.
 *
 * @param documentFactory the array of subfactories.
 * @param fieldName the names of this factory's fields.
 * @param fieldType the types of this factory's fields.
 * @param rename the way fields of this class are mapped to fields of the subfactories.
 * @param strategy the strategy to decide which factory should be used.
 */
private void init( final DocumentFactory[] documentFactory, final String[] fieldName,
		final FieldType[] fieldType, final int[][] rename, final DispatchingStrategy strategy ) {
	// Subfactories and their count.
	this.n = documentFactory.length;
	this.documentFactory = documentFactory;
	// Fields exposed by this factory and their count.
	this.numberOfFields = fieldName.length;
	this.fieldName = fieldName;
	this.fieldType = fieldType;
	// How fields are mapped and how the subfactory is chosen.
	this.rename = rename;
	this.strategy = strategy;
}
// TODO: All IllegalArgumentException were ConfigurationException; check that now it's OK
/** Checks that the configuration of this factory is complete and consistent.
 *
 * <p>The following conditions are checked: all configuration arrays are present; their lengths agree with
 * each other and with {@link #n} and {@link #numberOfFields}; every entry of {@link #rename} is either -1 or
 * a valid field index of the corresponding subfactory whose type matches the type of the field it is mapped to;
 * there is at least one subfactory and one field; a strategy has been specified.
 *
 * @throws IllegalArgumentException if any of the above conditions is violated.
 */
private void checkAttributes() {
	// A factory that was never configured has null arrays: report that as a configuration
	// problem rather than letting a NullPointerException escape.
	if ( documentFactory == null || fieldName == null || fieldType == null || rename == null )
		throw new IllegalArgumentException( "The dispatching factory is not configured: subfactories, field names, field types and field mapping must all be specified" );
	if ( fieldName.length != fieldType.length || rename.length != documentFactory.length || documentFactory.length != n || fieldName.length != numberOfFields ) throw new IllegalArgumentException( "Length mismatch in defining the dispatching factory");
	for ( int f = 0; f < n; f++ ) {
		if ( rename[ f ] == null || rename[ f ].length != numberOfFields ) throw new IllegalArgumentException( "The number of fields (" + numberOfFields + ") does not match the mapping rule for factory " + documentFactory[ f ].getClass().getName() );
		for ( int k = 0; k < numberOfFields; k++ ) {
			final int mapped = rename[ f ][ k ];
			// -1 is the only admissible negative value (it denotes an empty field).
			if ( mapped < -1 || mapped >= documentFactory[ f ].numberOfFields() )
				throw new IllegalArgumentException( mapped + " is not a field of factory " + documentFactory[ f ] );
			if ( mapped >= 0 && fieldType[ k ] != documentFactory[ f ].fieldType( mapped ) )
				throw new IllegalArgumentException( "Field " + mapped + " of factory " + documentFactory[ f ] + " has a type different from the type of the field it is mapped to" );
		}
	}
	if ( n == 0 || numberOfFields == 0 ) throw new IllegalArgumentException( "Zero factories or fields specified" );
	if ( strategy == null ) throw new IllegalArgumentException( "No strategy was specified" );
}
private void setExtraArguments( final Object xtraPars ) throws IllegalArgumentException {
if ( value2factoryClass == null ) throw new IllegalArgumentException( "No " + MetadataKeys.RULE + " property was specified for the dispatching factory" );
n = value2factoryClass.values().size();
documentFactory = new DocumentFactory[ n ];
Iterator> it = value2factoryClass.values().iterator();
for ( int f = 0; f < n; f++ ) {
Class extends DocumentFactory> documentFactoryClass = it.next();
try {
if ( xtraPars == null )
documentFactory[ f ] = documentFactoryClass.newInstance();
else
documentFactory[ f ] = documentFactoryClass.getConstructor( xtraPars.getClass() ).newInstance( xtraPars );
} catch ( Exception e ) {
throw new IllegalArgumentException( e );
}
}
fieldType = new FieldType[ numberOfFields ];
if ( rename == null ) throw new IllegalArgumentException( "No " + MetadataKeys.MAP + " property was specified for the dispatching factory" );
for ( int f = 0; f < n; f++ ) {
for ( int k = 0; k < numberOfFields; k++ ) {
int kk = rename[ f ][ k ];
if ( kk >= 0 && fieldType[ k ] != null && fieldType[ k ] != documentFactory[ f ].fieldType( kk ) )
throw new IllegalArgumentException( "Mismatch between field types for field " + f + ", relative to the remapping of factory " + documentFactory[ f ].getClass().getName() + " (the type used to be " + fieldType[ k ] + ", but now we want it to be " + documentFactory[ f ].fieldType( kk ) + ")" );
if ( kk >= 0 ) fieldType[ k ] = documentFactory[ f ].fieldType( kk );
}
}
for ( int f = 0; f < numberOfFields; f++ )
if ( fieldType[ f ] == null ) throw new IllegalArgumentException( "The type of field " + fieldName[ f ] + " could not be deduced, because it is never mapped to" );
if ( dispatchingKey == null ) throw new IllegalArgumentException( "No " + MetadataKeys.KEY + " property was specified for the dispatching factory" );
Object2IntMap value2int = new Object2IntOpenHashMap();
value2int.defaultReturnValue( -1 );
for( Map.Entry> e : value2factoryClass.entrySet() ) {
int k;
for ( k = 0; k < n; k++ )
if ( e.getValue() == documentFactory[ k ].getClass() ) {
if ( e.getKey().equals( OTHERWISE_IN_RULE ) ) value2int.defaultReturnValue( k );
else value2int.put( e.getKey(), k );
break;
}
if ( k == n ) throw new IllegalArgumentException( "Mismatch in the rule mapping " + e.getKey() + " to " + e.getValue() );
}
System.out.println( "Building a strategy mapping " + dispatchingKey + " to " + value2int );
strategy = new StringBasedDispatchingStrategy( dispatchingKey, value2int );
}
/** Creates a new dispatching factory from explicitly provided subfactories, fields, mapping and strategy.
 *
 * @param documentFactory the array of subfactories.
 * @param fieldName the names of this factory's fields.
 * @param fieldType the types of this factory's fields.
 * @param rename the way fields of this class are mapped to fields of the subfactories.
 * @param strategy the strategy to decide which factory should be used.
 * @throws IllegalArgumentException if the arguments fail the consistency checks of this factory.
 */
public DispatchingDocumentFactory( final DocumentFactory[] documentFactory, final String[] fieldName,
		final FieldType[] fieldType, final int[][] rename, final DispatchingStrategy strategy ) {
	super();
	// First record the configuration...
	init( documentFactory, fieldName, fieldType, rename, strategy );
	// ...then make sure it is consistent.
	checkAttributes();
}
/** Returns a new dispatching factory whose subfactories are copies of the subfactories of this factory.
 *
 * <p>Field names, field types, the field mapping and the strategy are passed to the new
 * factory as they are (they are not duplicated).
 *
 * @return a copy of this factory.
 */
public DispatchingDocumentFactory copy() {
	final int length = this.documentFactory.length;
	final DocumentFactory[] copiedFactory = new DocumentFactory[ length ];
	// Subfactories are copied from the last to the first.
	for( int i = length - 1; i >= 0; i-- ) {
		copiedFactory[ i ] = this.documentFactory[ i ].copy();
	}
	return new DispatchingDocumentFactory( copiedFactory, fieldName, fieldType, rename, strategy );
}
/** Creates a new dispatching factory configured by a set of properties.
 *
 * <p>The properties are first handed to the superclass constructor (presumably this is what ends up
 * invoking <code>parseProperty()</code> on each property; verify against
 * {@link PropertyBasedDocumentFactory}). The same properties object is then used as the
 * constructor argument of the subfactories, and finally the resulting configuration is validated.
 *
 * @param properties the properties configuring this factory.
 * @throws ConfigurationException declared by the superclass constructor.
 */
public DispatchingDocumentFactory( final Properties properties ) throws ConfigurationException {
super( properties );
setExtraArguments( properties );
checkAttributes();
}
/** Creates a new dispatching factory configured by an array of property strings.
 *
 * <p>The array is first handed to the superclass constructor (the expected format of the strings
 * is defined there and not visible here). The same array is then used as the constructor
 * argument of the subfactories, and finally the resulting configuration is validated.
 *
 * @param property the array of strings configuring this factory.
 * @throws ConfigurationException declared by the superclass constructor.
 */
public DispatchingDocumentFactory( final String[] property ) throws ConfigurationException {
super( property );
setExtraArguments( property );
checkAttributes();
}
/** Creates a new dispatching factory with the given default metadata.
 *
 * <p>No subfactory, field or strategy can be specified through this constructor, so
 * the validation of the configuration is expected to fail.
 *
 * @param defaultMetadata the default metadata, passed to the superclass constructor.
 */
public DispatchingDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) {
	super( defaultMetadata );
	checkAttributes(); // Will certainly fail because the configuration is actually missing
}
/** Creates a new dispatching factory without any configuration.
 *
 * <p>No subfactory, field or strategy can be specified through this constructor, so
 * the validation of the configuration is expected to fail.
 */
public DispatchingDocumentFactory() {
super();
checkAttributes(); // Will certainly fail because the configuration is actually missing
}
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
protected boolean parseProperty( final String key, final String[] values, final Reference2ObjectMap,Object> metadata ) throws ConfigurationException {
if ( sameKey( MetadataKeys.FIELDNAME, key ) ) {
fieldName = values;
numberOfFields = fieldName.length;
return true;
}
else if ( sameKey( MetadataKeys.KEY, key ) ) {
final String dispatchingKeyName = ensureJustOne( key, values );
final int lastDot = dispatchingKeyName.lastIndexOf( '.' );
try {
dispatchingKey = Enum.valueOf( (Class)Class.forName( dispatchingKeyName.substring( 0, lastDot ) ),
dispatchingKeyName.substring( lastDot + 1) );
}
catch ( ClassNotFoundException e ) {
throw new IllegalArgumentException( "The class specified in the key " + dispatchingKeyName + " cannot be found" );
}
return true;
}
else if ( sameKey( MetadataKeys.RULE, key ) ) {
String[] rules = values;
value2factoryClass = new Object2ObjectLinkedOpenHashMap>();
int i, m = rules.length;
for ( i = 0; i < m; i++ ) {
int pos = rules[ i ].indexOf( ':' );
if ( pos <= 0 || pos == rules[ i ].length() - 1 ) throw new ConfigurationException( "Rule " + rules[ i ] + " does not contain a colon or it is malformed" );
if ( rules[ i ].indexOf( ':', pos + 1 ) >= 0 ) throw new ConfigurationException( "Rule " + rules[ i ] + " contains too many colons" );
String factoryName = rules[ i ].substring( pos + 1 );
Class extends DocumentFactory> factoryClass = null;
try {
factoryClass = (Class extends DocumentFactory>)Class.forName( factoryName );
if ( ! ( DocumentFactory.class.isAssignableFrom( factoryClass ) ) ) throw new ClassNotFoundException();
} catch ( ClassNotFoundException e ) {
throw new ConfigurationException( "ParsingFactory " + factoryName + " is invalid; maybe the package name is missing" );
}
value2factoryClass.put( rules[ i ].substring( 0, pos ), factoryClass );
}
m = value2factoryClass.values().size();
return true;
}
else if ( sameKey( MetadataKeys.MAP, key ) ) {
String[] pieces = values;
int i, m = pieces.length;
rename = new int[ m ][];
for ( i = 0; i < m; i++ ) {
String[] subpieces = pieces[ i ].split( ":" );
if ( i > 0 && subpieces.length != rename[ 0 ].length ) throw new ConfigurationException( "Length mismatch in the map " + values );
rename[ i ] = new int[ subpieces.length ];
for ( int k = 0; k < subpieces.length; k++ ) {
try {
rename[ i ][ k ] = Integer.parseInt( subpieces[ k ] );
} catch ( NumberFormatException e ) {
throw new ConfigurationException( "Number format exception in the map " + values );
}
}
}
}
return super.parseProperty( key, values, metadata );
}
/** Returns the number of fields of this factory.
 *
 * @return the number of fields of this factory.
 */
public int numberOfFields() {
return numberOfFields;
}
/** Returns the name of a field of this factory.
 *
 * @param field the index of a field; it is first checked by <code>ensureFieldIndex()</code>
 * (defined outside this class; presumably it throws on an invalid index).
 * @return the name of the given field.
 */
public String fieldName( final int field ) {
ensureFieldIndex( field );
return fieldName[ field ];
}
/** Returns the index of the field with the given name.
 *
 * @param fieldName the name of a field.
 * @return the index of the first field of this factory whose name is equal to <code>fieldName</code>,
 * or -1 if there is no such field.
 */
public int fieldIndex( final String fieldName ) {
	int index = 0;
	while ( index < numberOfFields ) {
		if ( this.fieldName[ index ].equals( fieldName ) ) {
			return index;
		}
		index++;
	}
	// No field of this factory has the given name.
	return -1;
}
/** Returns the type of a field of this factory.
 *
 * @param field the index of a field; it is first checked by <code>ensureFieldIndex()</code>
 * (defined outside this class; presumably it throws on an invalid index).
 * @return the type of the given field.
 */
public FieldType fieldType( final int field ) {
ensureFieldIndex( field );
return fieldType[ field ];
}
/** A word reader that is returned when a null field should be returned.
 * The same instance is handed out by every document created by this factory. */
final private WordReader nullReader = new FastBufferedReader();
public Document getDocument( final InputStream rawContent, final Reference2ObjectMap,Object> metadata ) throws IOException {
final int factoryIndex = strategy.factoryNumber( metadata, this );
System.out.println( "The strategy returned " + factoryIndex );
if ( factoryIndex < 0 || factoryIndex >= n ) throw new IllegalArgumentException();
System.out.println( "Going to parse a document with " + metadata + ", using " + documentFactory[ factoryIndex ].getClass().getName() );
final DocumentFactory factory = documentFactory[ factoryIndex ];
final Document document = factory.getDocument( rawContent, metadata );
return new AbstractDocument() {
public CharSequence title() {
return document.title();
}
public String toString() {
return document.toString();
}
public CharSequence uri() {
return document.uri();
}
public Object content( final int field ) throws IOException {
ensureFieldIndex( field );
if ( rename[ factoryIndex ][ field ] < 0 ) return NullReader.getInstance();
return document.content( rename[ factoryIndex ][ field ] );
}
public WordReader wordReader( final int field ) {
ensureFieldIndex( field );
if ( rename[ factoryIndex ][ field ] < 0 ) return nullReader;
return document.wordReader( rename[ factoryIndex ][ field ] );
}
public void close() throws IOException {
super.close();
document.close();
}
};
}
/** Builds a sample dispatching factory from properties, uses it to create a
 * {@link FileSetDocumentCollection} over the files named on the command line, and stores the
 * collection in the file <code>test.collection</code>.
 *
 * @param arg the names of the files that will form the collection.
 * @throws IOException declared for the storage of the serialized collection.
 * @throws ConfigurationException if the sample properties cannot be parsed.
 */
public static void main( final String[] arg ) throws IOException, ConfigurationException {
	// Sketch of a programmatic construction, kept from the original source for reference:
	//PdfDocumentFactory pdfFactory = new PdfDocumentFactory();
	//HtmlDocumentFactory htmlFactory = new HtmlDocumentFactory();
	//IdentityDocumentFactory idFactory = new IdentityDocumentFactory();
	//Object2IntMap map = new Object2IntOpenHashMap(
	//		new String[] { "application/pdf", "text/html" },
	//		new int[] { 0, 1 }
	//	);
	//map.defaultReturnValue( 2 );
	//DispatchingStrategy strategy = new StringBasedDispatchingStrategy( MetadataKeys.MIMETYPE, map );
	final Properties p = new Properties();
	p.addProperty( MetadataKeys.FIELDNAME.name().toLowerCase(), "text,title" );
	// The key must be given as "<enum class name>.<constant name>", because parseProperty() loads
	// the enum class by name; Class.getName() provides the binary name required for a nested enum.
	p.addProperty( MetadataKeys.KEY.name().toLowerCase(),
			PropertyBasedDocumentFactory.MetadataKeys.class.getName() + "." + PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE.name() );
	p.addProperty( MetadataKeys.RULE.name().toLowerCase(), "application/pdf:it.unimi.di.mg4j.document.PdfDocumentFactory,text/html:it.unimi.di.mg4j.document.HtmlDocumentFactory,?:it.unimi.di.mg4j.document.IdentityDocumentFactory" );
	// One colon-separated list per rule, added exactly once: adding the property again would
	// append further lists, which would no longer match the three factories above.
	p.addProperty( MetadataKeys.MAP.name().toLowerCase(), "0:-1,0:1,0:-1" );
	p.addProperty( PropertyBasedDocumentFactory.MetadataKeys.ENCODING.name().toLowerCase(), "iso-8859-1" );
	final DispatchingDocumentFactory factory = new DispatchingDocumentFactory( p );
	final DocumentCollection dc = new FileSetDocumentCollection( arg, factory );
	BinIO.storeObject( dc, "test.collection" );
}
}