All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.terrier.querying.ApplyTermPipeline Maven / Gradle / Ivy

The newest version!
/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is ApplyTermPipeline.java.
 *
 * The Original Code is Copyright (C) 2017-2020 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *  Craig Macdonald
 */
package org.terrier.querying;

import java.util.Iterator;

import gnu.trove.TIntArrayList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.terrier.matching.MatchingQueryTerms;
import org.terrier.matching.MatchingQueryTerms.MatchingTerm;
import org.terrier.matching.matchops.MultiTermOp;
import org.terrier.matching.matchops.Operator;
import org.terrier.matching.matchops.SingleTermOp;
import org.terrier.terms.BaseTermPipelineAccessor;
import org.terrier.structures.Index;
import org.terrier.structures.IndexConfigurable;
import org.terrier.structures.PropertiesIndex;
import org.terrier.terms.TermPipelineAccessor;
import org.terrier.utility.ApplicationSetup;

/**
 * Applies the TermPipeline to the MatchingQueryTerms object.
 * The configuration of the TermPipeline is obtained based on the following configurations:
 * <ol>
 * <li>A "termpipelines" control setting for the query, if present</li>
 * <li>The "termpipelines" property of the index, if present</li>
 * <li>Failing that, the "termpipelines" property in ApplicationSetup</li>
 * </ol>
 * These configuration rules were introduced in Terrier 5.6. This means that previously where a property was sufficient
 * to override any stemming configuration recorded in the index at retrieval, now a control is required. However, as the index termpipeline configuration
 * is used by default, the scenarios where explicit configuration is needed are less frequent. In particular, if you create an index
 * using a particular stemmer, that stemmer should be automatically picked up from the index (some exceptions - realtime and
 * composed indices).
 */
@ProcessPhaseRequisites(ManagerRequisite.MQT)
public class ApplyTermPipeline implements Process {

	/** Logging error messages */
	private static final Logger LOG = LoggerFactory.getLogger(ApplyTermPipeline.class);

	/** Pipeline built by the constructor; returned whenever no other configuration applies. */
	TermPipelineAccessor defaultTpa = null;
	/** Description of the most recently loaded pipeline, reported by {@link #getInfo()}. */
	String info = null;
	/** The configuration string that {@link #defaultTpa} was built from. */
	String defaultPipelineSource = null;
	/** Configuration string of the cached non-default pipeline, or null if none was loaded yet. */
	String lastPipeline = null;
	/** The cached non-default pipeline corresponding to {@link #lastPipeline}. */
	TermPipelineAccessor lastTpa = null;
	/** Set once the "no index, no control" warning has been logged, so that it is logged only once. */
	boolean warnNoIndexNoControl = false;
	
	/**
	 * Uses the "termpipelines" property from ApplicationSetup
	 * (default "Stopwords,PorterStemmer") as the default pipeline.
	 */
	public ApplyTermPipeline()
	{
		this(ApplicationSetup.getProperty("termpipelines", "Stopwords,PorterStemmer").trim());
	}

	/**
	 * Uses the specified comma-separated pipeline configuration as the default pipeline.
	 * @param pipeline comma-separated names of the term pipeline stages
	 */
	public ApplyTermPipeline(String pipeline)
	{
		synchronized (this) {
			defaultTpa = this.load_pipeline(pipeline);
			defaultPipelineSource = pipeline;
		}
	}
	
	/**
	 * Load in the term pipeline. As a side effect, {@link #info} is updated
	 * to describe the loaded configuration.
	 * @param tp comma-separated names of the term pipeline stages; whitespace around commas is ignored
	 * @return an accessor for the newly constructed pipeline
	 */
	protected TermPipelineAccessor load_pipeline(final String tp)
	{
		final String[] pipes = tp.split("\\s*,\\s*");
		info = "termpipelines=" + tp;
		return new BaseTermPipelineAccessor(pipes);	
	}
	
	/** Visits query operators; each method returns false if the visited operator should be dropped from the query. */
	interface Visitor {
		boolean visit(Operator qt);
		boolean visit(SingleTermOp sqt);
		boolean visit(MultiTermOp mqt);
	}
	
	/**
	 * Selects the term pipeline to use for the given request. A "termpipelines"
	 * control on the request takes precedence; otherwise the "termpipelines"
	 * property of a {@link PropertiesIndex} is used; otherwise the default
	 * pipeline created by the constructor is returned. The most recently
	 * loaded non-default pipeline is cached and reused while the configuration
	 * string stays the same.
	 * 
	 * @param manager not used by this implementation
	 * @param q the request providing the index and the controls
	 * @return the pipeline to apply to the request's query terms
	 */
	TermPipelineAccessor getPipeline(Manager manager, Request q) {
		
		assert defaultPipelineSource != null;

		String tp = null;
		final Index index = q.getIndex();
		final boolean hasControl = q.hasControl("termpipelines");
		boolean noIndexNoControl = false;
		
		if (hasControl)
		{
			tp = q.getControl("termpipelines");
		}
		else if (index != null)
		{
			if (index instanceof PropertiesIndex) {
				tp = ((PropertiesIndex) index).getIndexProperty("termpipelines", null);
			}
			// either not a properties index, or no property found
			noIndexNoControl = (tp == null);
		}
		
		if (tp == null || defaultPipelineSource.equals(tp)) {
			
			//we want to promote the use of controls over ApplicationSetup. This will warn if ApplicationSetup is used.
			//it will likely only occur for settings of MultiIndex and RealtimeIndex etc.
			if (noIndexNoControl && ! warnNoIndexNoControl)
			{
				LOG.warn("The index has no termpipelines configuration, and no control configuration is found. "+
					"Defaulting to global termpipelines configuration of '"+defaultPipelineSource+"'. " +  
					"Set a termpipelines control to remove this warning.");
				warnNoIndexNoControl = true;
			}
			
			return defaultTpa;
		}
		
		// The cache is both read and written under the lock, and lastPipeline is
		// only published once the matching accessor is fully loaded and configured.
		// Otherwise a concurrent caller could observe the new lastPipeline paired
		// with a stale (or null) lastTpa, and a failure in load_pipeline would
		// leave the cache inconsistent.
		synchronized(this) {
			if (tp.equals(lastPipeline))
			{
				assert lastTpa != null;
				return lastTpa;
			}
			info = tp; // normally replaced by load_pipeline(), which sets its own description
			final TermPipelineAccessor loaded = load_pipeline(tp);
			if (index != null && loaded instanceof IndexConfigurable) {
				((IndexConfigurable)loaded).setIndex(index);
			}
			lastTpa = loaded;
			lastPipeline = tp;
			return loaded;
		}
		
	}
	
	/**
	 * Passes every term of the request's MatchingQueryTerms through the selected
	 * term pipeline. Top-level single terms that the pipeline maps to null are
	 * removed from the query. Afterwards, adjacent entries that have the same
	 * string form and equal properties are merged into the first such entry by
	 * summing their weights.
	 * 
	 * @param manager passed on to {@link #getPipeline(Manager, Request)}
	 * @param q the request whose matching query terms are rewritten in place
	 */
	@Override
	public void process(Manager manager, Request q) {
		
		final TermPipelineAccessor tpa = getPipeline(manager, q);
		assert tpa != null;
		
		final Visitor visitor = new Visitor()
		{
			@Override
			public boolean visit(Operator qt) {
				// dispatch on the runtime type; other operator types are left untouched
				if (qt instanceof SingleTermOp)
				{
					return this.visit((SingleTermOp)qt);
				}
				if (qt instanceof MultiTermOp)
				{
					return this.visit((MultiTermOp)qt);
				}
				return true;
			}
			
			@Override
			public boolean visit(SingleTermOp sqt) {
				final String origTerm = sqt.getTerm();
				final String newTerm;
				synchronized(tpa) {
					newTerm = tpa.pipelineTerm(origTerm);
				}
				if (newTerm == null)
					return false;
				sqt.setTerm(newTerm);
				return true;
			}

			@Override
			public boolean visit(MultiTermOp mqt) {
				// The outcome for each constituent is deliberately ignored: a constituent
				// rejected by the pipeline keeps its original term and the operator is retained.
				for (Operator qt : mqt.getConstituents()) {
					this.visit(qt);
				}
				//TODO check if all required?
				return true;
			}
			
		};
		
		final MatchingQueryTerms mqt = q.getMatchingQueryTerms();
		final TIntArrayList toDel = new TIntArrayList();
		String lastTerm = null;
		boolean dups = false;
		int i = -1;
		for (MatchingTerm t : mqt)
		{
			i++;
			if (! visitor.visit(t.getKey()))
			{
				toDel.add(i);
				continue;
			}
			// compare with the previous surviving term to detect adjacent duplicates
			final String term = t.getKey().toString();
			dups = dups || term.equals(lastTerm);
			lastTerm = term;
		}
		
		// remove from the highest position downwards, so the remaining positions stay valid
		toDel.reverse();
		for (int removeIndex : toDel.toNativeArray())
		{
			mqt.remove(removeIndex);
		}
		
		if (! dups)
			return;
		
		MatchingTerm prev = null;
		final Iterator<MatchingTerm> iter = mqt.iterator();
		while (iter.hasNext())
		{
			final MatchingTerm t = iter.next();
			if (prev != null 
					&& t.getKey().toString().equals(prev.getKey().toString())  // this and the previous have the same string
					&& t.getValue().equals(prev.getValue()) // this previous word has the same models, tags and requirements
					)
			{
				prev.getValue().setWeight(prev.getValue().getWeight() + t.getValue().getWeight());
				iter.remove();
				// prev must keep referring to the surviving entry, so that the weight
				// of any further duplicate is not added to an entry that was removed
				continue;
			}
			prev = t;
		}
	}

	/**
	 * Describes this process and the most recently loaded pipeline configuration.
	 * @return the simple class name followed by the pipeline description in parentheses
	 */
	@Override
	public String getInfo() {
		return this.getClass().getSimpleName() + '(' + this.info + ')';
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy