// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
// org.terrier.querying.DFRBagExpansionTerms Maven / Gradle / Ivy
// The newest version!
/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is DFRBagExpansionTerms.java.
 *
 * The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Gianni Amati  (original author)
 *   Ben He  
 *   Vassilis Plachouras 
 *   Craig Macdonald 
 */
package org.terrier.querying;
import gnu.trove.TIntObjectHashMap;

import java.io.IOException;
import java.util.Arrays;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.terrier.matching.MatchingQueryTerms;
import org.terrier.matching.models.queryexpansion.QueryExpansionModel;
import org.terrier.querying.parser.SingleTermQuery;
import org.terrier.structures.BitIndexPointer;
import org.terrier.structures.CollectionStatistics;
import org.terrier.structures.DocumentIndex;
import org.terrier.structures.Lexicon;
import org.terrier.structures.LexiconEntry;
import org.terrier.structures.PostingIndex;
import org.terrier.structures.postings.IterablePosting;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.Rounding;

/**
 * This class implements a data structure of terms in the top-retrieved documents. 
 * In particular, this implementation treats the entire feedback set as a bag of words,
 * and weights term occurrences in this bag.
 * 

Properties:

    *
  • expansion.mindocuments - the minimum number of documents a term must exist in * before it can be considered to be informative. Defaults to 2. For more information, see * Giambattista Amati: Information Theoretic Approach to Information Extraction. FQAS 2006: 519-529 DOI 10.1007/11766254_44
* @author Gianni Amati, Ben He, Vassilis Plachouras, Craig Macdonald */ public class DFRBagExpansionTerms extends ExpansionTerms { /** The logger used */ protected static Logger logger = LoggerFactory.getLogger(DFRBagExpansionTerms.class); /** The terms in the top-retrieval documents. */ protected TIntObjectHashMap terms; /** The lexicon used for retrieval. */ protected Lexicon lexicon; protected PostingIndex directIndex; protected DocumentIndex documentIndex; /** The number of documents in the collection. */ protected int numberOfDocuments; /** The number of tokens in the collection. */ protected long numberOfTokens; /** The average document length in the collection. */ protected double averageDocumentLength; /** The number of tokens in the X top ranked documents. */ protected double totalDocumentLength; /** * The parameter-free term weight normaliser. */ public double normaliser = 1d; protected int feedbackDocumentCount = 0; /** The minimum number of documents a term must occur in to be considered for expanded terms. This is not considered a parameter of query expansion, as the default value of 2 works extremely well. Set using the property expansion.mindocuments */ int EXPANSION_MIN_DOCUMENTS = Integer.parseInt(ApplicationSetup.getProperty("expansion.mindocuments","2")); /** * Constructs an instance of ExpansionTerms. * @param collStats Statistics of the used corpora * @param _lexicon Lexicon The lexicon used for retrieval. 
* @param _directIndex DirectIndex to use for finding terms for documents * @param _documentIndex DocumentIndex to use for finding statistics about documents */ public DFRBagExpansionTerms(CollectionStatistics collStats, Lexicon _lexicon, PostingIndex _directIndex, DocumentIndex _documentIndex) { this.numberOfDocuments = collStats.getNumberOfDocuments(); this.numberOfTokens = collStats.getNumberOfTokens(); this.averageDocumentLength = collStats.getAverageDocumentLength(); this.terms = new TIntObjectHashMap(); this.totalDocumentLength = 0; this.lexicon = _lexicon; this.documentIndex = _documentIndex; this.directIndex = _directIndex; } /** Allows the totalDocumentLength to be set after the fact */ public void setTotalDocumentLength(double totalLength) { this.totalDocumentLength = totalLength; } @Override public void setOriginalQueryTerms(MatchingQueryTerms query){ for (String term : query.getTerms()) { if (query.getStatistics(term) == null) { query.setTermProperty(term, lexicon.getLexiconEntry(term)); } } super.setOriginalQueryTerms(query); } /** Returns the termids of all terms found in the top-ranked documents */ public int[] getTermIds() { return terms.keys(); } /** Returns the unique number of terms found in all the top-ranked documents */ public int getNumberOfUniqueTerms() { return terms.size(); } /** Returns expanded terms * * @return terms */ public TIntObjectHashMap getExpansionTerms() { return terms; } /** * This method implements the functionality of assigning expansion weights to * the terms in the top-retrieved documents, and returns the most informative * terms among them. Conservative Query Expansion (ConservativeQE) is used if * the number of expanded terms is set to 0. In this case, no new query terms * are added to the query, only the existing ones reweighted. * @param numberOfExpandedTerms int The number of terms to extract from the * top-retrieved documents. ConservativeQE is set if this parameter is set to 0. 
* * @return TermTreeNode[] The expanded terms. */ public SingleTermQuery[] getExpandedTerms(int numberOfExpandedTerms) { return getExpandedTerms(numberOfExpandedTerms, model); } /* @param QEModel QueryExpansionModel the model used for query expansion */ protected SingleTermQuery[] getExpandedTerms(int numberOfExpandedTerms, QueryExpansionModel QEModel) { assignWeights(QEModel); SingleTermQuery[] results = null; if (numberOfExpandedTerms != 0) { ExpansionTerm[] termEntries = terms.getValues(new ExpansionTerm[0]); //sort by descending score Arrays.sort(termEntries, EXPANSIONTERM_DESC_SCORE_SORTER); numberOfExpandedTerms = Math.min(termEntries.length, numberOfExpandedTerms); results = new SingleTermQuery[numberOfExpandedTerms]; logger.debug("First weight = "+termEntries[0].getWeightExpansion() + " last weight="+termEntries[termEntries.length-1].getWeightExpansion()); for (int i = 0; i < numberOfExpandedTerms; i++) { Map.Entry lee = lexicon.getLexiconEntry(termEntries[i].getTermID()); results[i] = new SingleTermQuery(lee.getKey()); results[i].setWeight(termEntries[i].getWeightExpansion()); } } else { //numberOfExpandedTerms=0, Conservative"QE" results = new SingleTermQuery[originalTermids.size()]; int i=0; for(int termId : originalTermids.keys()) { results[i] = new SingleTermQuery(originalTermids.get(termId)); if (terms.containsKey(termId)) { results[i].setWeight(terms.get(termId).getWeightExpansion()); } else { // query term is not found in the feedback documents. we add it with // weight 0, which has no real impact. results[i].setWeight(0d); } //if (!QEModel.PARAMETER_FREE) // results[i].setWeight(results[i].getWeight()*QEModel.ROCCHIO_BETA); i++; } } return results; } /** Remove the records for a given term */ public void deleteTerm(int termid) { terms.remove(termid); } /** * Returns the weight of a given term, computed by the * specified query expansion model. * @param term String the term to set the weight for. 
* @param model QueryExpansionModel the used query expansion model. * @return double the weight of the specified term. */ public double getExpansionWeight(String term, QueryExpansionModel model) { return this.getExpansionWeight(lexicon.getLexiconEntry(term).getTermId(), model); } /** * Returns the weight of a given term. * @param term String the term to get the weight for. * @return double the weight of the specified term. */ public double getExpansionWeight(String term) { return this.getExpansionWeight(lexicon.getLexiconEntry(term).getTermId(), model); } /** * Returns the un-normalised weight of a given term. * @param term String the given term. * @return The un-normalised term weight. */ public double getOriginalExpansionWeight(String term){ return getExpansionWeight(term)*normaliser; } /** * Returns the frequency of a given term in the top-ranked documents. * @param term String the term to get the frequency for. * @return double the frequency of the specified term in the top-ranked documents. */ public double getFrequency(String term){ return this.getFrequency(lexicon.getLexiconEntry(term).getTermId()); } /** * Returns the frequency of a given term in the top-ranked documents. * @param termId int the id of the term to get the frequency for. * @return double the frequency of the specified term in the top-ranked documents. */ public double getFrequency(int termId){ ExpansionTerm o = terms.get(termId); if (o == null) return 0; return o.getWithinDocumentFrequency(); } /** * Returns the number of the top-ranked documents a given term occurs in. * @param termId int the id of the term to get the frequency for. * @return double the document frequency of the specified term in the top-ranked documents. */ public double getDocumentFrequency(int termId){ ExpansionTerm o = terms.get(termId); if (o == null) return 0; return o.getDocumentFrequency(); } /** * Assign weight to terms that are stored in ExpansionTerm[] terms. 
* @param QEModel QueryExpansionModel the used query expansion model. */ public void assignWeights(QueryExpansionModel QEModel){ // Set required statistics to the query expansion model QEModel.setTotalDocumentLength(this.totalDocumentLength); QEModel.setCollectionLength(this.numberOfTokens); QEModel.setAverageDocumentLength(this.averageDocumentLength); QEModel.setNumberOfDocuments(this.numberOfDocuments); // weight the terms int posMaxWeight = 0; ExpansionTerm[] allTerms = terms.getValues(new ExpansionTerm[0]); final int minDF = feedbackDocumentCount < EXPANSION_MIN_DOCUMENTS ? 0 : EXPANSION_MIN_DOCUMENTS; final int len = allTerms.length; for (int i=0; i 0 && allTerms[i].getDocumentFrequency() < minDF && !originalTermids.contains(allTerms[i].getTermID())) { allTerms[i].setWeightExpansion(0); continue; } double TF = 0; //double Nt = 0; Map.Entry lee = lexicon.getLexiconEntry(allTerms[i].getTermID()); if (lee == null) { logger.error("Termid " + allTerms[i].getTermID() +" was not found in the lexicon"); continue; } TF = lee.getValue().getFrequency(); //Nt = lee.getValue().getDocumentFrequency(); allTerms[i].setWeightExpansion(QEModel.score( allTerms[i].getWithinDocumentFrequency(), TF ) ); logger.debug("Term " + lee.getKey() + " weight = " + allTerms[i].getWeightExpansion()); if (allTerms[i].getWeightExpansion() > allTerms[posMaxWeight].getWeightExpansion()) posMaxWeight = i; } // get the normaliser normaliser = allTerms[posMaxWeight].getWeightExpansion(); if (QEModel.PARAMETER_FREE){ QEModel.setMaxTermFrequency(allTerms[posMaxWeight].getWithinDocumentFrequency()); normaliser = QEModel.parameterFreeNormaliser(); if(logger.isDebugEnabled()){ logger.info("parameter free query expansion."); } } if(logger.isDebugEnabled()){ String term = lexicon.getLexiconEntry(allTerms[posMaxWeight].termID).getKey(); logger.debug("term with the maximum weight: " + term + ", normaliser: " + Rounding.toString(normaliser, 4)); } for (int i = 0; i < len; i++){ 
allTerms[i].setWeightExpansion(allTerms[i].getWeightExpansion()/normaliser); //expandedTerms[i].normalisedFrequency = //terms[i].getWeightExpansion()/normaliser; if (!QEModel.PARAMETER_FREE) allTerms[i].setWeightExpansion(allTerms[i].getWeightExpansion()*QEModel.ROCCHIO_BETA); //normalisedFrequency *= QEModel.ROCCHIO_BETA; } } /** * Returns the weight of a term with the given * term identifier, computed by the specified * query expansion model. * @param termId int the term identifier to set the weight for. * @param model QueryExpansionModel the used query expansion model. * @return double the weight of the specified term. */ public double getExpansionWeight(int termId, QueryExpansionModel model){ double score = 0; ExpansionTerm o = terms.get(termId); if (o != null) { double TF = 0; //double Nt = 0; Map.Entry lee = lexicon.getLexiconEntry(termId); TF = lee.getValue().getFrequency(); //Nt = lee.getValue().getDocumentFrequency(); score = model.score(o.getWithinDocumentFrequency(), TF, this.totalDocumentLength, this.numberOfTokens, this.averageDocumentLength ); } return score; } /** * Returns the weight of a term with the given * term identifier. * @param termId int the term identifier to set the weight for. * @return double the weight of the specified term. */ public double getExpansionWeight(int termId){ ExpansionTerm o = terms.get(termId); if (o == null) return -1; return o.getWeightExpansion(); } /** Returns the probability of a given termid occurring * in the expansion documents. Returns the quotient * document frequency in the expansion documents, divided * by the total length of all the expansion documents. * @param termId int the term identifier to obtain the probability * @return double the probability of the term */ public double getExpansionProbability(int termId) { ExpansionTerm o = terms.get(termId); if (o == null) return -1; return o.getDocumentFrequency() / totalDocumentLength; } /** * Adds the feedback document to the feedback set. 
*/ public void insertDocument(FeedbackDocument doc) throws IOException { logger.debug("Inserting docid " + doc.docid); insertDocument(doc.docid, doc.rank, doc.score); } /** * Adds the feedback document from the index given a docid */ public void insertDocument(int docid, int rank, double score) throws IOException { totalDocumentLength += documentIndex.getDocumentLength(docid); final IterablePosting ip = directIndex.getPostings(documentIndex.getDocumentEntry(docid)); if (ip == null) { logger.warn("document id "+docid+" not found"); return; } while(ip.next() != IterablePosting.EOL) { this.insertTerm(ip.getId(), ip.getFrequency()); } feedbackDocumentCount++; } /** * Add a term in the X top-retrieved documents as a candidate of the * expanded terms. * @param termID int the integer identifier of a term * @param withinDocumentFrequency double the within document * frequency of a term */ protected void insertTerm(int termID, double withinDocumentFrequency) { final ExpansionTerm et = terms.get(termID); if (et == null) terms.put(termID, new ExpansionTerm(termID, withinDocumentFrequency)); else et.insertRecord(withinDocumentFrequency); } }




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy