
/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is DFRBagExpansionTerms.java.
*
* The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Gianni Amati (original author)
* Ben He
* Vassilis Plachouras
* Craig Macdonald
*/
package org.terrier.querying;
import gnu.trove.TIntObjectHashMap;
import java.io.IOException;
import java.util.Arrays;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.terrier.matching.MatchingQueryTerms;
import org.terrier.matching.models.queryexpansion.QueryExpansionModel;
import org.terrier.querying.parser.SingleTermQuery;
import org.terrier.structures.BitIndexPointer;
import org.terrier.structures.CollectionStatistics;
import org.terrier.structures.DocumentIndex;
import org.terrier.structures.Lexicon;
import org.terrier.structures.LexiconEntry;
import org.terrier.structures.PostingIndex;
import org.terrier.structures.postings.IterablePosting;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.Rounding;
/**
* This class implements a data structure of terms in the top-retrieved documents.
* In particular, this implementation treats the entire feedback set as a bag of words,
* and weights term occurrences in this bag.
* <p><b>Properties:</b>
* <ul>
* <li><tt>expansion.mindocuments</tt> - the minimum number of documents a term must occur in
* before it can be considered to be informative. Defaults to 2. For more information, see
* Giambattista Amati: Information Theoretic Approach to Information Extraction. FQAS 2006: 519-529. DOI 10.1007/11766254_44</li>
* </ul>
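* <p>Example usage (a minimal sketch, not taken from the Terrier distribution: it assumes
* an already-opened {@code Index} named {@code index}, the docids of the top-ranked
* documents in an {@code int[]} named {@code topDocids}, a {@code QueryExpansionModel}
* named {@code qeModel}, and a {@code setModel} setter inherited from
* {@code ExpansionTerms}; IOException handling and any casts to the generic types are elided):
* <pre>{@code
* DFRBagExpansionTerms expTerms = new DFRBagExpansionTerms(
*     index.getCollectionStatistics(), index.getLexicon(),
*     index.getDirectIndex(), index.getDocumentIndex());
* for (int rank = 0; rank < topDocids.length; rank++)
*     expTerms.insertDocument(topDocids[rank], rank, 0.0d); // the score argument is unused by this class
* expTerms.setModel(qeModel); // assumed superclass setter
* SingleTermQuery[] expanded = expTerms.getExpandedTerms(10);
* }</pre>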
* @author Gianni Amati, Ben He, Vassilis Plachouras, Craig Macdonald
*/
public class DFRBagExpansionTerms extends ExpansionTerms {
/** The logger used */
protected static Logger logger = LoggerFactory.getLogger(DFRBagExpansionTerms.class);
/** The terms in the top-retrieval documents. */
protected TIntObjectHashMap<ExpansionTerm> terms;
/** The lexicon used for retrieval. */
protected Lexicon<String> lexicon;
protected PostingIndex<BitIndexPointer> directIndex;
protected DocumentIndex documentIndex;
/** The number of documents in the collection. */
protected int numberOfDocuments;
/** The number of tokens in the collection. */
protected long numberOfTokens;
/** The average document length in the collection. */
protected double averageDocumentLength;
/** The number of tokens in the X top ranked documents. */
protected double totalDocumentLength;
/**
* The parameter-free term weight normaliser.
*/
public double normaliser = 1d;
protected int feedbackDocumentCount = 0;
/** The minimum number of documents a term must occur in to be considered for expanded terms. This is not considered a parameter of query expansion, as the default value of 2 works extremely well. Set using the property expansion.mindocuments */
int EXPANSION_MIN_DOCUMENTS = Integer.parseInt(ApplicationSetup.getProperty("expansion.mindocuments","2"));
/**
* Constructs an instance of ExpansionTerms.
* @param collStats Statistics of the used corpora
* @param _lexicon Lexicon The lexicon used for retrieval.
* @param _directIndex DirectIndex to use for finding terms for documents
* @param _documentIndex DocumentIndex to use for finding statistics about documents
*/
public DFRBagExpansionTerms(CollectionStatistics collStats, Lexicon<String> _lexicon, PostingIndex<BitIndexPointer> _directIndex, DocumentIndex _documentIndex) {
this.numberOfDocuments = collStats.getNumberOfDocuments();
this.numberOfTokens = collStats.getNumberOfTokens();
this.averageDocumentLength = collStats.getAverageDocumentLength();
this.terms = new TIntObjectHashMap<ExpansionTerm>();
this.totalDocumentLength = 0;
this.lexicon = _lexicon;
this.documentIndex = _documentIndex;
this.directIndex = _directIndex;
}
/** Allows the totalDocumentLength to be set after the fact */
public void setTotalDocumentLength(double totalLength)
{
this.totalDocumentLength = totalLength;
}
@Override
public void setOriginalQueryTerms(MatchingQueryTerms query){
for (String term : query.getTerms())
{
if (query.getStatistics(term) == null) {
query.setTermProperty(term, lexicon.getLexiconEntry(term));
}
}
super.setOriginalQueryTerms(query);
}
/** Returns the termids of all terms found in the top-ranked documents */
public int[] getTermIds()
{
return terms.keys();
}
/** Returns the number of unique terms found in all the top-ranked documents */
public int getNumberOfUniqueTerms()
{
return terms.size();
}
/** Returns the candidate expansion terms, keyed by term id.
*
* @return terms
*/
public TIntObjectHashMap<ExpansionTerm> getExpansionTerms()
{
return terms;
}
/**
* This method implements the functionality of assigning expansion weights to
* the terms in the top-retrieved documents, and returns the most informative
* terms among them. Conservative Query Expansion (ConservativeQE) is used if
* the number of expanded terms is set to 0. In this case, no new query terms
* are added to the query; only the existing ones are reweighted.
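* <p>A small illustration (a sketch; {@code expTerms} is a hypothetical instance with
* feedback documents already inserted and an expansion model already set):
* <pre>{@code
* SingleTermQuery[] top10 = expTerms.getExpandedTerms(10); // the 10 most informative terms
* SingleTermQuery[] reweighted = expTerms.getExpandedTerms(0); // ConservativeQE: original terms only
* }</pre>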
* @param numberOfExpandedTerms int The number of terms to extract from the
* top-retrieved documents. ConservativeQE is set if this parameter is set to 0.
* @return SingleTermQuery[] the expanded terms.
*/
public SingleTermQuery[] getExpandedTerms(int numberOfExpandedTerms) {
return getExpandedTerms(numberOfExpandedTerms, model);
}
/** As {@link #getExpandedTerms(int)}, but using the specified model.
* @param QEModel QueryExpansionModel the model used for query expansion */
protected SingleTermQuery[] getExpandedTerms(int numberOfExpandedTerms, QueryExpansionModel QEModel) {
assignWeights(QEModel);
SingleTermQuery[] results = null;
if (numberOfExpandedTerms != 0)
{
ExpansionTerm[] termEntries = terms.getValues(new ExpansionTerm[0]);
//sort by descending score
Arrays.sort(termEntries, EXPANSIONTERM_DESC_SCORE_SORTER);
numberOfExpandedTerms = Math.min(termEntries.length, numberOfExpandedTerms);
results = new SingleTermQuery[numberOfExpandedTerms];
logger.debug("First weight = "+termEntries[0].getWeightExpansion() + " last weight="+termEntries[termEntries.length-1].getWeightExpansion());
for (int i = 0; i < numberOfExpandedTerms; i++)
{
Map.Entry<String,LexiconEntry> lee = lexicon.getLexiconEntry(termEntries[i].getTermID());
results[i] = new SingleTermQuery(lee.getKey());
results[i].setWeight(termEntries[i].getWeightExpansion());
}
} else { // numberOfExpandedTerms == 0: ConservativeQE
results = new SingleTermQuery[originalTermids.size()];
int i=0;
for(int termId : originalTermids.keys())
{
results[i] = new SingleTermQuery(originalTermids.get(termId));
if (terms.containsKey(termId)) {
results[i].setWeight(terms.get(termId).getWeightExpansion());
} else {
// query term is not found in the feedback documents. we add it with
// weight 0, which has no real impact.
results[i].setWeight(0d);
}
//if (!QEModel.PARAMETER_FREE)
// results[i].setWeight(results[i].getWeight()*QEModel.ROCCHIO_BETA);
i++;
}
}
return results;
}
/** Remove the records for a given term */
public void deleteTerm(int termid)
{
terms.remove(termid);
}
/**
* Returns the weight of a given term, computed by the
* specified query expansion model.
* @param term String the term to set the weight for.
* @param model QueryExpansionModel the used query expansion model.
* @return double the weight of the specified term.
*/
public double getExpansionWeight(String term, QueryExpansionModel model)
{
return this.getExpansionWeight(lexicon.getLexiconEntry(term).getTermId(), model);
}
/**
* Returns the weight of a given term.
* @param term String the term to get the weight for.
* @return double the weight of the specified term.
*/
public double getExpansionWeight(String term)
{
return this.getExpansionWeight(lexicon.getLexiconEntry(term).getTermId(), model);
}
/**
* Returns the un-normalised weight of a given term.
* @param term String the given term.
* @return The un-normalised term weight.
*/
public double getOriginalExpansionWeight(String term){
return getExpansionWeight(term)*normaliser;
}
/**
* Returns the frequency of a given term in the top-ranked documents.
* @param term String the term to get the frequency for.
* @return double the frequency of the specified term in the top-ranked documents.
*/
public double getFrequency(String term){
return this.getFrequency(lexicon.getLexiconEntry(term).getTermId());
}
/**
* Returns the frequency of a given term in the top-ranked documents.
* @param termId int the id of the term to get the frequency for.
* @return double the frequency of the specified term in the top-ranked documents.
*/
public double getFrequency(int termId){
ExpansionTerm o = terms.get(termId);
if (o == null)
return 0;
return o.getWithinDocumentFrequency();
}
/**
* Returns the number of the top-ranked documents a given term occurs in.
* @param termId int the id of the term to get the frequency for.
* @return double the document frequency of the specified term in the top-ranked documents.
*/
public double getDocumentFrequency(int termId){
ExpansionTerm o = terms.get(termId);
if (o == null)
return 0;
return o.getDocumentFrequency();
}
/**
* Assign weight to terms that are stored in ExpansionTerm[] terms.
* @param QEModel QueryExpansionModel the used query expansion model.
*/
public void assignWeights(QueryExpansionModel QEModel){
// Set required statistics to the query expansion model
QEModel.setTotalDocumentLength(this.totalDocumentLength);
QEModel.setCollectionLength(this.numberOfTokens);
QEModel.setAverageDocumentLength(this.averageDocumentLength);
QEModel.setNumberOfDocuments(this.numberOfDocuments);
// weight the terms
int posMaxWeight = 0;
ExpansionTerm[] allTerms = terms.getValues(new ExpansionTerm[0]);
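// Disable the document-frequency filter when there are fewer feedback documents
// than expansion.mindocuments; otherwise require each candidate term to appear in
// at least EXPANSION_MIN_DOCUMENTS feedback documents (original query terms are exempt).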
final int minDF = feedbackDocumentCount < EXPANSION_MIN_DOCUMENTS ? 0 : EXPANSION_MIN_DOCUMENTS;
final int len = allTerms.length;
for (int i=0; i<len; i++)
{
if (minDF > 0 && allTerms[i].getDocumentFrequency() < minDF &&
!originalTermids.contains(allTerms[i].getTermID()))
{
allTerms[i].setWeightExpansion(0);
continue;
}
double TF = 0;
//double Nt = 0;
Map.Entry<String,LexiconEntry> lee = lexicon.getLexiconEntry(allTerms[i].getTermID());
if (lee == null)
{
logger.error("Termid " + allTerms[i].getTermID() +" was not found in the lexicon");
continue;
}
TF = lee.getValue().getFrequency();
//Nt = lee.getValue().getDocumentFrequency();
allTerms[i].setWeightExpansion(QEModel.score(
allTerms[i].getWithinDocumentFrequency(),
TF
)
);
logger.debug("Term " + lee.getKey() + " weight = " + allTerms[i].getWeightExpansion());
if (allTerms[i].getWeightExpansion() > allTerms[posMaxWeight].getWeightExpansion())
posMaxWeight = i;
}
// get the normaliser
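// For parameter-free models the normaliser is obtained from the model itself;
// otherwise the maximum raw expansion weight is used, so the strongest term normalises to 1.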
normaliser = allTerms[posMaxWeight].getWeightExpansion();
if (QEModel.PARAMETER_FREE){
QEModel.setMaxTermFrequency(allTerms[posMaxWeight].getWithinDocumentFrequency());
normaliser = QEModel.parameterFreeNormaliser();
if(logger.isDebugEnabled()){
logger.debug("parameter free query expansion.");
}
}
if(logger.isDebugEnabled()){
String term = lexicon.getLexiconEntry(allTerms[posMaxWeight].termID).getKey();
logger.debug("term with the maximum weight: " + term +
", normaliser: " + Rounding.toString(normaliser, 4));
}
for (int i = 0; i < len; i++){
allTerms[i].setWeightExpansion(allTerms[i].getWeightExpansion()/normaliser);
//expandedTerms[i].normalisedFrequency =
//terms[i].getWeightExpansion()/normaliser;
if (!QEModel.PARAMETER_FREE)
allTerms[i].setWeightExpansion(allTerms[i].getWeightExpansion()*QEModel.ROCCHIO_BETA);
//normalisedFrequency *= QEModel.ROCCHIO_BETA;
}
}
/**
* Returns the weight of a term with the given
* term identifier, computed by the specified
* query expansion model.
* @param termId int the term identifier to set the weight for.
* @param model QueryExpansionModel the used query expansion model.
* @return double the weight of the specified term.
*/
public double getExpansionWeight(int termId, QueryExpansionModel model){
double score = 0;
ExpansionTerm o = terms.get(termId);
if (o != null)
{
double TF = 0;
//double Nt = 0;
Map.Entry<String,LexiconEntry> lee = lexicon.getLexiconEntry(termId);
TF = lee.getValue().getFrequency();
//Nt = lee.getValue().getDocumentFrequency();
score = model.score(o.getWithinDocumentFrequency(),
TF,
this.totalDocumentLength,
this.numberOfTokens,
this.averageDocumentLength
);
}
return score;
}
/**
* Returns the weight of a term with the given
* term identifier.
* @param termId int the term identifier to set the weight for.
* @return double the weight of the specified term.
*/
public double getExpansionWeight(int termId){
ExpansionTerm o = terms.get(termId);
if (o == null)
return -1;
return o.getWeightExpansion();
}
/** Returns the probability of a given termid occurring
* in the expansion documents: the quotient of the term's
* document frequency in the expansion documents divided
* by the total length of all the expansion documents,
* or -1 if the term does not occur in any expansion document.
* @param termId int the term identifier to obtain the probability for
* @return double the probability of the term */
public double getExpansionProbability(int termId) {
ExpansionTerm o = terms.get(termId);
if (o == null)
return -1;
return o.getDocumentFrequency() / totalDocumentLength;
}
/**
* Adds the feedback document to the feedback set.
*/
public void insertDocument(FeedbackDocument doc) throws IOException
{
logger.debug("Inserting docid " + doc.docid);
insertDocument(doc.docid, doc.rank, doc.score);
}
/**
* Adds the feedback document from the index given a docid
*/
public void insertDocument(int docid, int rank, double score) throws IOException
{
totalDocumentLength += documentIndex.getDocumentLength(docid);
final IterablePosting ip = directIndex.getPostings(documentIndex.getDocumentEntry(docid));
if (ip == null)
{
logger.warn("document id "+docid+" not found");
return;
}
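// Iterate over all postings (terms) of this document in the direct index,
// accumulating each term's within-document frequency into the bag of words.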
while(ip.next() != IterablePosting.EOL)
{
this.insertTerm(ip.getId(), ip.getFrequency());
}
feedbackDocumentCount++;
}
/**
* Add a term in the X top-retrieved documents as a candidate of the
* expanded terms.
* @param termID int the integer identifier of a term
* @param withinDocumentFrequency double the within document
* frequency of a term
*/
protected void insertTerm(int termID, double withinDocumentFrequency) {
final ExpansionTerm et = terms.get(termID);
if (et == null)
terms.put(termID, new ExpansionTerm(termID, withinDocumentFrequency));
else
et.insertRecord(withinDocumentFrequency);
}
}