All downloads are free. Search and download functionality uses the official Maven repository.

src.it.unimi.dsi.big.mg4j.query.QueryServlet Maven / Gradle / Ivy

Go to download

MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.

The newest version!
package it.unimi.dsi.big.mg4j.query;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2005-2011 Sebastiano Vigna 
 *
 *  This program is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.BigList;
import it.unimi.dsi.fastutil.objects.Object2ReferenceMap;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.big.mg4j.document.Document;
import it.unimi.dsi.big.mg4j.document.DocumentCollection;
import it.unimi.dsi.big.mg4j.document.DocumentFactory;
import it.unimi.dsi.big.mg4j.index.Index;
import it.unimi.dsi.big.mg4j.query.nodes.QueryBuilderVisitorException;
import it.unimi.dsi.big.mg4j.query.parser.QueryParserException;
import it.unimi.dsi.big.mg4j.search.score.DocumentScoreInfo;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;

import javax.servlet.ServletConfig;
import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.collections.ExtendedProperties;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.log4j.Logger;
import org.apache.velocity.Template;
import org.apache.velocity.context.Context;
import org.apache.velocity.tools.view.servlet.VelocityViewServlet;


/** A query servlet.
 * 
 * 

This class provides a basic servlet for searching a collection. * It expects some data (a collection, an index map and a path) * in the {@link javax.servlet.ServletContext} (see the code for {@link #init()}). It * can be used to search in a collection, but it is essentially a worked-out example. * *

The three parameters are q, the query, m, the maximum * number of results to be displayed, and s, the first result to be displayed. * *

Usually, the URI associated with each result is taken from the collection. Alternatively, each * result will point to the /Item path with some query arguments (doc, containing * the document pointer, uri, containing the original URI, and m, containing * an optional suggested MIME type). See, for instance, {@link it.unimi.dsi.big.mg4j.query.GenericItem} and {@link it.unimi.dsi.big.mg4j.query.InputStreamItem}. * *

The Velocity template used by this servlet can be set using the initialisation parameter * template (or using a context attribute with the same name). If you're using * this servlet via {@link HttpQueryServer}, please read the documentation therein for * information about template resolution order. * *

This servlet is thread safe. Each instance uses its own flyweight copies of the * {@linkplain it.unimi.dsi.big.mg4j.document.DocumentCollection collection} and * {@linkplain it.unimi.dsi.big.mg4j.query.QueryEngine query engine} to return the result (in particular, snippets). In a production * site it might be more sensible to pool and reuse such classes. * *

Warning: the {@link #loadConfiguration(ServletConfig)} method initialises * Velocity with some default parameters: in particular, template resolution is performed first on the classpath, then relatively to the current directory, and * finally using absolute pathnames. Watch out for template resolution issues. */ public class QueryServlet extends VelocityViewServlet { private static final long serialVersionUID = 1L; private final static Logger LOGGER = Util.getLogger( QueryServlet.class ); /** Standard maximum number of items to be displayed (may be altered with the m query parameter). */ private final static int STD_MAX_NUM_ITEMS = 10; /** The default Velocity template used by this servlet; may be overriden in the context using an attribute named template. */ protected final static String DEFAULT_TEMPLATE = "it/unimi/dsi/big/mg4j/query/query.velocity"; /** The actual template used by this servlet (default: {@link #DEFAULT_TEMPLATE}). */ protected String template; /** The query engine. */ protected QueryEngine queryEngine; /** The document collection. */ protected DocumentCollection documentCollection; /** An optional title list if the document collection is not present. */ protected BigList titleList; /** A sorted map from index names to indices: the first entry is the default index. */ protected Object2ReferenceMap indexMap; /** The indices of the fields specified in the index map, in increasing order (for document access). */ private Index[] sortedIndex; /** If not null, a MIME type suggested to the servlet. */ private String urlEncodedMimeType; /** If true, the link associated with each item must be built using the document URI. */ private boolean useUri; /** If true, URIs are files that should be derelativised. 
*/ private boolean derelativise; @Override protected ExtendedProperties loadConfiguration( final ServletConfig config ) throws FileNotFoundException, IOException { return HttpQueryServer.setLiberalResourceLoading( super.loadConfiguration( config ) ); } @SuppressWarnings("unchecked") @Override public void init() throws ServletException { super.init(); ServletContext context = getServletContext(); if ( ( template = (String)getServletContext().getAttribute( "template" ) ) == null && ( template = getInitParameter( "template" ) ) == null ) template = DEFAULT_TEMPLATE; queryEngine = (QueryEngine)context.getAttribute( "queryEngine" ); documentCollection = (DocumentCollection)context.getAttribute( "collection" ); titleList = (BigList)context.getAttribute( "titleList" ); indexMap = queryEngine.indexMap; try { urlEncodedMimeType = URLEncoder.encode( (String)context.getAttribute( "mimeType" ), "UTF-8" ); } catch ( UnsupportedEncodingException cantHappen ) { throw new RuntimeException( cantHappen ); } useUri = context.getAttribute( "uri" ) == Boolean.TRUE; derelativise = context.getAttribute( "derelativise" ) == Boolean.TRUE; if ( documentCollection != null ) { sortedIndex = new Index[ indexMap.size() ]; indexMap.values().toArray( sortedIndex ); Arrays.sort( sortedIndex, new Comparator() { public int compare( final Index x, final Index y ) { return documentCollection.factory().fieldIndex( x.field ) - documentCollection.factory().fieldIndex( y.field ); } }); } } public Template handleRequest( final HttpServletRequest request, final HttpServletResponse response, final Context context ) { try { response.setCharacterEncoding( "UTF-8" ); // This string is URL-encoded, and with the wrong coding. //String query = request.getParameter( "q" ) != null ? new String( request.getParameter( "q" ).getBytes( "ISO-8859-1" ), "UTF-8" ) : null; String query = request.getParameter( "q" ); context.put( "action", request.getContextPath() + request.getServletPath() ); // Sanitise parameters. 
int start = 0, maxNumItems = STD_MAX_NUM_ITEMS; try { maxNumItems = Integer.parseInt( request.getParameter( "m" ) ); } catch( NumberFormatException dontCare ) {} try { start = Integer.parseInt( request.getParameter( "s" ) ); } catch( NumberFormatException dontCare ) {} if ( maxNumItems < 0 || maxNumItems > 1000 ) maxNumItems = STD_MAX_NUM_ITEMS; if ( start < 0 ) start = 0; if ( query != null && query.length() != 0 ) { // This is used to display again the query in the input control. context.put( "q", StringEscapeUtils.escapeHtml( query ) ); // This is used to put the query in URLs. context.put( "qUrl", URLEncoder.encode( query, "UTF-8" ) ); context.put( "firstItem", new Integer( start ) ); // First of all, we check that the query is correct long time = -System.currentTimeMillis(); ObjectArrayList>> results = new ObjectArrayList>>(); int globNumItems; try { globNumItems = queryEngine.copy().process( query, start, maxNumItems, results ); } catch( QueryBuilderVisitorException e ) { context.put( "errmsg", StringEscapeUtils.escapeHtml( e.getCause().toString() ) ); return getTemplate( template ); } catch( QueryParserException e ) { context.put( "errmsg", StringEscapeUtils.escapeHtml( e.getCause().toString() ) ); return getTemplate( template ); } catch( Exception e ) { context.put( "errmsg", StringEscapeUtils.escapeHtml( e.toString() ) ); return getTemplate( template ); } time += System.currentTimeMillis(); ObjectArrayList resultItems = new ObjectArrayList(); if ( ! results.isEmpty() ) { SelectedInterval[] selectedInterval = null; final DocumentCollection collection = documentCollection != null ? 
documentCollection.copy() : null; for( int i = 0; i < results.size(); i++ ) { DocumentScoreInfo> dsi = results.get( i ); LOGGER.debug( "Intervals for item " + i ); final ResultItem resultItem = new ResultItem( dsi.document, dsi.score ); resultItems.add( resultItem ); if ( collection != null ) { final Document document = collection.document( dsi.document ); // If both collection and title list are present, we override the collection title (cfr. Query) resultItem.title = StringEscapeUtils.escapeHtml( titleList != null ? titleList.get( resultItem.doc ).toString() : document.title().toString() ); if ( useUri ) { if ( document.uri() != null ) resultItem.uri = StringEscapeUtils.escapeHtml( document.uri().toString() ); } else { if ( document.uri() != null ) { String stringUri = document.uri().toString(); // TODO: this is a quick patch to get the file server running with relative files final String documentUri = URLEncoder.encode( derelativise ? new File( stringUri.startsWith( "file:" ) ? stringUri.substring( 5 ) : stringUri ).getAbsoluteFile().toURI().toASCIIString() : document.uri().toString(), "UTF-8" ); resultItem.uri = StringEscapeUtils.escapeHtml( "./Item?doc=" + resultItem.doc + "&m=" + urlEncodedMimeType + "&uri=" + documentUri ); } else resultItem.uri = StringEscapeUtils.escapeHtml( "./Item?doc=" + resultItem.doc + "&m=" + urlEncodedMimeType ); } MarkingMutableString snippet = new MarkingMutableString( TextMarker.HTML_STRONG, MarkingMutableString.HTML_ESCAPE ); for( int j = 0; j < sortedIndex.length; j++ ) { if ( ! 
sortedIndex[ j ].hasPositions || dsi.info == null ) continue; selectedInterval = dsi.info.get( sortedIndex[ j ] ); if ( selectedInterval != null ) { final int field = documentCollection.factory().fieldIndex( sortedIndex[ j ].field ); // If the field is not present (e.g., because of parallel indexing) or it is not text we skip if ( field == -1 || documentCollection.factory().fieldType( field ) != DocumentFactory.FieldType.TEXT ) continue; LOGGER.debug( "Found intervals for " + sortedIndex[ j ].field + " (" + field + ")" ); final Reader content = (Reader)document.content( field ); snippet.startField( selectedInterval ).appendAndMark( document.wordReader( field ).setReader( content ) ).endField(); } if ( LOGGER.isDebugEnabled() ) LOGGER.debug( sortedIndex[ j ].field + ": " + ( selectedInterval == null ? null : Arrays.asList( selectedInterval ) ) ); document.close(); } resultItem.text = snippet; } else { if ( titleList != null ) { // TODO: this is a bit radical resultItem.title = resultItem.uri = titleList.get( resultItem.doc ); } else { resultItem.title = "Document #" + resultItem.doc; resultItem.uri = new MutableString( "./Item?doc=" ).append( resultItem.doc ).append( "&m=" ).append( urlEncodedMimeType ); } MutableString text = new MutableString(); for( Iterator j = indexMap.values().iterator(); j.hasNext(); ) { final Index index = j.next(); selectedInterval = dsi.info.get( index ); if ( selectedInterval != null ) text.append( "

" ).append( index.field ).append( ": " ).append( Arrays.asList( selectedInterval ) ); LOGGER.debug( index.field + ": " + ( selectedInterval == null ? null : Arrays.asList( selectedInterval ) ) ); } resultItem.text = text; } } if ( collection != null ) collection.close(); } // Note that if we pass an array to the template we lose the possibility of measuring its length. context.put( "result", resultItems ); /* Note that this number is just the number of relevant documents met while trying to obtain the current results. Due to the short-circuit semantics of the "and then" operator, it might not reflect accurately the overall number of results of the query. */ context.put( "globNumItems", new Integer( globNumItems ) ); context.put( "start", new Integer( start ) ); context.put( "maxNumItems", new Integer( maxNumItems ) ); context.put( "time", new Integer( (int)time ) ); context.put( "speed", new Long( (int)( globNumItems * 1000L / ( time + 1 ) ) ) ); } return getTemplate( template ); } catch( Exception e ) { e.printStackTrace( System.err ); return null; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy