All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.big.mg4j.document.HtmlDocumentFactory Maven / Gradle / Ivy

Go to download

MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.

The newest version!
package it.unimi.dsi.big.mg4j.document;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2005-2011 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.fastutil.chars.CharArrays;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.big.mg4j.util.MG4JClassParser;
import it.unimi.dsi.big.mg4j.util.parser.callback.AnchorExtractor;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.callback.ComposedCallbackBuilder;
import it.unimi.dsi.parser.callback.TextExtractor;
import it.unimi.dsi.util.Properties;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.Reader;
import java.nio.charset.Charset;

import org.apache.commons.configuration.ConfigurationException;

/** A factory that provides fields for body and title of HTML documents. 
 * It uses internally a {@link BulletParser}. 
 * A default encoding can be provided
 * using the property {@link it.unimi.dsi.big.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys#ENCODING}.
 * 
 * 

By default, the {@link WordReader} provided by this factory * is just a {@link FastBufferedReader}, but you can specify * an alternative word reader using the property * {@link it.unimi.dsi.big.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys#WORDREADER}. */ public class HtmlDocumentFactory extends PropertyBasedDocumentFactory { private static final long serialVersionUID = 1L; public static enum MetadataKeys { /** The maximum number of characters before an anchor. */ MAXPREANCHOR, /** The maximum number of characters in an anchor. */ MAXANCHOR, /** The maximum number of characters after an anchor. */ MAXPOSTANCHOR, }; private static final int DEFAULT_BUFFER_SIZE = 16 * 1024; /** A parser that will be used to extract text from HTML documents. */ private transient BulletParser parser; /** The callback recording text. */ private transient TextExtractor textExtractor; /** The callback for anchors. */ private transient AnchorExtractor anchorExtractor; /** The word reader used for all documents. */ private transient WordReader wordReader; /** The maximum number of characters before an anchor. */ private int maxPreAnchor; /** The maximum number of characters in an anchor. */ private int maxAnchor; /** The maximum number of characters after an anchor. 
*/ private int maxPostAnchor; private transient char[] text; protected boolean parseProperty( final String key, final String[] values, final Reference2ObjectMap,Object> metadata ) throws ConfigurationException { if ( sameKey( PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, key ) ) { metadata.put( PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, ensureJustOne( key, values ) ); return true; } else if ( sameKey( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, key ) ) { metadata.put( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, Charset.forName( ensureJustOne( key, values ) ).toString() ); return true; } else if ( sameKey( PropertyBasedDocumentFactory.MetadataKeys.WORDREADER, key ) ) { try { final String spec = ( ensureJustOne( key, values ) ).toString(); metadata.put( PropertyBasedDocumentFactory.MetadataKeys.WORDREADER, spec ); // Just to check ObjectParser.fromSpec( spec, WordReader.class, MG4JClassParser.PACKAGE ); } catch ( ClassNotFoundException e ) { throw new ConfigurationException( e ); } // TODO: this must turn into a more appropriate exception catch ( Exception e ) { throw new ConfigurationException( e ); } return true; } else if ( sameKey( MetadataKeys.MAXPREANCHOR, key ) ) { metadata.put( MetadataKeys.MAXPREANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) ); return true; } else if ( sameKey( MetadataKeys.MAXANCHOR, key ) ) { metadata.put( MetadataKeys.MAXANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) ); return true; } else if ( sameKey( MetadataKeys.MAXPOSTANCHOR, key ) ) { metadata.put( MetadataKeys.MAXPOSTANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) ); return true; } return super.parseProperty( key, values, metadata ); } private void init() { this.parser = new BulletParser(); ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder(); composedBuilder.add( this.textExtractor = new TextExtractor() ); composedBuilder.add( this.anchorExtractor = new AnchorExtractor( maxPreAnchor, maxAnchor, 
maxPostAnchor ) ); parser.setCallback( composedBuilder.compose() ); Object o; try { o = defaultMetadata.get( PropertyBasedDocumentFactory.MetadataKeys.WORDREADER ); wordReader = o == null ? new FastBufferedReader() : ObjectParser.fromSpec( o.toString(), WordReader.class, MG4JClassParser.PACKAGE ); } catch ( Exception e ) { throw new RuntimeException( e ); } text = new char[ DEFAULT_BUFFER_SIZE ]; } @SuppressWarnings("boxing") private void initVars() { maxPreAnchor = (Integer)resolve( MetadataKeys.MAXPREANCHOR, defaultMetadata, 8 ); maxAnchor = (Integer)resolve( MetadataKeys.MAXANCHOR, defaultMetadata, 256 ); maxPostAnchor = (Integer)resolve( MetadataKeys.MAXPOSTANCHOR, defaultMetadata, 4 ); } /** Returns a copy of this document factory. A new parser is allocated for the copy. */ public HtmlDocumentFactory copy() { return new HtmlDocumentFactory( defaultMetadata ); } public HtmlDocumentFactory( final Properties properties ) throws ConfigurationException { super( properties ); initVars(); init(); } public HtmlDocumentFactory( final Reference2ObjectMap,Object> defaultMetadata ) { super( defaultMetadata ); initVars(); init(); } public HtmlDocumentFactory( final String[] property ) throws ConfigurationException { super( property ); initVars(); init(); } public HtmlDocumentFactory() { super(); initVars(); init(); } public int numberOfFields() { return 3; } public String fieldName( final int field ) { ensureFieldIndex( field ); switch( field ) { case 0: return "text"; case 1: return "title"; case 2: return "anchor"; default: throw new IllegalArgumentException(); } } public int fieldIndex( final String fieldName ) { for ( int i = 0; i < numberOfFields(); i++ ) if ( fieldName( i ).equals( fieldName ) ) return i; return -1; } public FieldType fieldType( final int field ) { ensureFieldIndex( field ); switch( field ) { case 0: return FieldType.TEXT; case 1: return FieldType.TEXT; case 2: return FieldType.VIRTUAL; default: throw new IllegalArgumentException(); } } private void 
readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException { s.defaultReadObject(); init(); } /** An HTML document. If a TITLE element is available, it will be used for {@link #title()} * instead of the default value. * *

We delay the actual parsing until it is actually necessary, so operations like * getting the document URI will not require parsing. */ protected class HtmlDocument extends AbstractDocument { private final Reference2ObjectMap,Object> metadata; /** Whether we already parsed the document. */ private boolean parsed; /** The cached raw content. */ private final InputStream rawContent; private void ensureParsed() throws IOException { if ( parsed ) return; int offset = 0, l; Reader r = new InputStreamReader( rawContent, (String)resolveNotNull( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, metadata ) ); while( ( l = r.read( text, offset, text.length - offset ) ) > 0 ) { offset += l; text = CharArrays.grow( text, offset + 1 ); } parser.parse( text, 0, offset ); textExtractor.title.trim(); parsed = true; } protected HtmlDocument( final InputStream rawContent, final Reference2ObjectMap,Object> metadata ) { this.metadata = metadata; this.rawContent = rawContent; } public CharSequence title() { try { ensureParsed(); } catch ( IOException e ) { throw new RuntimeException( e ); } return (CharSequence)( textExtractor.title.length() == 0 ? 
resolve( PropertyBasedDocumentFactory.MetadataKeys.TITLE, metadata ): textExtractor.title ); } public String toString() { return title().toString(); } public CharSequence uri() { return (CharSequence)resolve( PropertyBasedDocumentFactory.MetadataKeys.URI, metadata ); } public Object content( final int field ) throws IOException { ensureFieldIndex( field ); ensureParsed(); switch( field ) { case 0: return new FastBufferedReader( textExtractor.text ); case 1: return new FastBufferedReader( textExtractor.title ); case 2: return anchorExtractor.anchors; default: throw new IllegalArgumentException(); } } public WordReader wordReader( final int field ) { ensureFieldIndex( field ); return wordReader; } } public Document getDocument( final InputStream rawContent, final Reference2ObjectMap,Object> metadata ) throws IOException { return new HtmlDocument( rawContent, metadata ); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy