
org.terrier.matching.ScoringMatching Maven / Gradle / Ivy
The newest version!
package org.terrier.matching;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Set;
import java.util.function.Predicate;
import org.apache.commons.lang3.tuple.Pair;
import org.terrier.matching.MatchingQueryTerms.MatchingTerm;
import org.terrier.matching.models.WeightingModel;
import org.terrier.structures.CollectionStatistics;
import org.terrier.structures.EntryStatistics;
import org.terrier.structures.Index;
import org.terrier.structures.Lexicon;
import org.terrier.structures.PostingIndex;
import org.terrier.structures.postings.IterablePosting;
/** Matching implementation that uses a parent Matching instance to get the docids to work with.
* Scores are replaced using the specified weighting model. Scoring is done in a DAAT fashion.
*
* @author craigm
*
*/
public class ScoringMatching extends AbstractScoringMatching {
// NOTE(review): generic type parameters in this file appear to have been stripped by an
// HTML/extraction step (e.g. "Lexicon", "PostingIndex>", "Predicate>>", raw "Set[]",
// raw "Iterator") -- restore them against the upstream Terrier source before compiling.
/** lexicon of the underlying index; populated by the constructors when index != null */
Lexicon lexicon;
/** inverted (posting) index of the underlying index; populated when index != null */
PostingIndex> invertedIndex;
/** collection statistics passed to the PostingListManager; lazily (re)fetched in doMatch()
 * if null, and overridable via setCollectionStatistics() */
CollectionStatistics cs;
/** the result set being re-scored; doMatch() stores its input here and returns this field,
 * possibly replaced by finalise() with a new (sorted/truncated) result set */
ResultSet rs_input;
/** scores parallel to docids; either cloned from the input result set or zero-initialised,
 * depending on keepInputScores */
double[] scores;
/** docids of the documents to re-score, sorted ascending before the DAAT pass */
int[] docids;
/** count of documents given a non-zero score by assignScore() */
int scored = 0;
/** Constructs a ScoringMatching with a term filter.
 * @param _index index to obtain lexicon/inverted index/statistics from (may be null)
 * @param _parent parent Matching whose results supply the docids to re-score
 * @param _wm weighting model used to replace the scores
 * @param _filter predicate over (term, tags) pairs; terms failing it are not scored
 */
public ScoringMatching(Index _index, Matching _parent, WeightingModel _wm, Predicate>> _filter)
{
super(_index, _parent, _wm, _filter);
// index may legitimately be null (e.g. statistics injected later via setCollectionStatistics)
if (this.index != null)
{
this.lexicon = index.getLexicon();
this.invertedIndex = index.getInvertedIndex();
this.cs = index.getCollectionStatistics();
}
}
/** Constructs a ScoringMatching without a term filter.
 * @param _index index to obtain lexicon/inverted index/statistics from (may be null)
 * @param _parent parent Matching whose results supply the docids to re-score
 * @param _wm weighting model used to replace the scores
 */
public ScoringMatching(Index _index, Matching _parent, WeightingModel _wm)
{
super(_index, _parent, _wm);
if (this.index != null)
{
this.lexicon = index.getLexicon();
this.invertedIndex = index.getInvertedIndex();
this.cs = index.getCollectionStatistics();
}
}
/** Constructs a ScoringMatching with no weighting model override; the models already
 * attached to the query terms are used as-is (see the wm == null branches in doMatch()).
 * @param _index index to obtain lexicon/inverted index/statistics from (may be null)
 * @param _parent parent Matching whose results supply the docids to re-score
 */
public ScoringMatching(Index _index, Matching _parent)
{
super(_index, _parent, null);
if (this.index != null)
{
this.lexicon = index.getLexicon();
this.invertedIndex = index.getInvertedIndex();
this.cs = index.getCollectionStatistics();
}
}
/** Re-scores the documents of an existing result set in a DAAT fashion.
 * Terms rejected by the filter (if any) are removed from queryTerms (a side effect on the
 * caller's object). If a weighting model was supplied, it replaces the termModels of every
 * query term. The docids/scores arrays of the input result set are sorted by ascending docid
 * in place.
 * @param queryNumber query id, used only for logging
 * @param queryTerms terms to score with; MUTATED (filtered terms removed, termModels replaced)
 * @param rsInput result set whose docids are to be re-scored
 * @param keepInputScores if true, start from a copy of the input scores rather than zeros
 * @return the (re-scored, possibly truncated) result set
 * @throws IOException on posting-list access failure
 */
public ResultSet doMatch(String queryNumber, MatchingQueryTerms queryTerms, ResultSet rsInput, boolean keepInputScores) throws IOException
{
// lazily obtain statistics if neither constructor nor setCollectionStatistics() set them
if (this.cs == null)
this.cs = index.getCollectionStatistics();
rs_input = rsInput;
docids = rs_input.getDocids();
final int docCount = docids.length;
scores = keepInputScores ? rs_input.getScores().clone() : new double[docCount];
//sort by ascending docid
org.terrier.sorting.HeapSort.heapSort(docids, scores, docCount);
//this smells like a hack
// overwrite every term's weighting model with the single configured one
if (super.wm != null) {
queryTerms.forEach( qtPair -> qtPair.getValue().termModels = Arrays.asList(wm));
logger.info("ScoringMatching running for " + wm.getInfo() + ' '+ queryNumber);
} else {
logger.info("ScoringMatching running for " + queryNumber);
}
// drop any terms suppressed by the filter (mutates queryTerms via iter.remove())
Iterator iter = queryTerms.iterator();
while(iter.hasNext())
{
MatchingTerm term = iter.next();
//check if this term has been suppressed by the filter
boolean okToScore = true;
if (filterTerm != null)
okToScore = filterTerm.test(Pair.of(term.getKey().toString(),term.getValue().getTags()));
if (! okToScore)
{
// NOTE(review): this calls wm.getInfo() even though wm may be null (see the
// wm == null branch above) -- potential NPE when a filter is used without a wm; verify.
logger.debug("Term: "+term.getKey().toString()+"$"+term.getValue().getTags()+" not scored for wm " + wm.getInfo() + ' '+ queryNumber);
iter.remove();
continue;
}
}
if(queryTerms.size() ==0)
logger.warn("no terms being scored for " + queryNumber);
PostingListManager plm = new PostingListManager(index, this.cs, queryTerms, true, null, null);
plm.prepare(true);
final int terms = plm.getNumTerms();
assert(terms > 0);
String[] qTerms = new String[terms];
EntryStatistics[] entryStats = new EntryStatistics[terms];
double[] keyFreqs = new double[terms];
Set[] tags = new Set[terms];
// NOTE(review): the following region is garbled -- the source appears truncated by the
// extraction step. The loop filling qTerms/entryStats/keyFreqs/tags, the makeResultSet()
// call, and the opening of the per-document loop (declarations of matchingCount, matching,
// i, docid, score, anyTermMatch, t, ip) seem to be missing. TODO recover from upstream.
for (int i=0; i= docid
//if (ip.next(docid) == docid)
// advance this term's posting list until its current id is at least docid
while(ip.getId() < docid)
{
if (ip.next() == IterablePosting.EOL)
break;
}
if (ip.getId() == docid)
{//only if this posting list has a posting for docid
//save the posting for this
matching[t] = ip;
anyTermMatch = true;
score += plm.score(t);
}
}
// record the accumulated score for this document if any term matched it
if (anyTermMatch) {
matchingCount++;
assignScore(i, docid, score, matching);
}
}
assert matchingCount <= docids.length;
if (this.wm == null)
{
logger.info(this.getClass().getSimpleName() + " for "+terms+" terms, scored " + matchingCount + " of " + docids.length + " retrieved documents docCount="+docCount + " matchingCount="+matchingCount);
} else {
logger.info(this.getClass().getSimpleName() + " for "+this.wm.getInfo()+" on "+terms+" terms, scored " + matchingCount + " of " + docids.length + " retrieved documents docCount="+docCount + " matchingCount="+matchingCount);
}
plm.close();
finalise(docids.length);
return rs_input;
}
/** Builds the final result set, sorting (if enabled) and truncating to the number of
 * documents actually scored. Replaces rs_input.
 * @param numScored number of entries of the result set that carry valid scores
 */
protected void finalise(final int numScored)
{
// all entries scored: keep the full result set, just sort it if requested
if (numScored == getFinalResultSet().getResultSize())
{
rs_input = getFinalResultSet();
if (sort)
rs_input.sort();
}
else
{
// only a prefix was scored: sort the first numScored entries, then truncate to them
rs_input = getFinalResultSet();
if (sort)
rs_input.sort(numScored);
rs_input = rs_input.getResultSet(0, numScored);
}
}
/** Hook for subclasses to build a result set from per-term metadata before scoring starts.
 * No-op in this class (the input result set's arrays are reused directly). */
protected void makeResultSet(int docCount, String[] qs, EntryStatistics[] es, double[] ks, Set[] tags)
{}
/** Records the score for the document at the given offset of the docids array, and counts
 * it in {@link #scored} when the score is non-zero. Subclasses may override to capture the
 * matching postings as well.
 * @param offset position within docids/scores
 * @param docid document id being scored (unused here)
 * @param score score to record
 * @param postings the postings that matched this document (unused here)
 */
protected void assignScore(int offset, int docid, double score, IterablePosting[] postings)
{
scores[offset] = score;
if (score != 0.0d)
scored++;
}
/** Wraps the current docids/scores arrays in a new QueryResultSet (the arrays are shared,
 * not copied).
 * @return a QueryResultSet over the current docids and scores
 */
protected ResultSet getFinalResultSet()
{
QueryResultSet rtr = new QueryResultSet(scores.length);
rtr.docids = docids;
rtr.scores = scores;
return rtr;
//return rs_input;
}
@Override
public String getInfo() {
return "ScoringMatching";
}
/** Overrides the collection statistics used when scoring (e.g. for distributed settings). */
@Override
public void setCollectionStatistics(CollectionStatistics cs) {
this.cs = cs;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy