All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.github.repir.Strategy.Collector.CollectorDocument Maven / Gradle / Ivy

The newest version!
package io.github.repir.Strategy.Collector;

import io.github.repir.Repository.ReportedUnstoredFeature;
import io.github.repir.Retriever.Document;
import io.github.repir.Retriever.DocumentComparator;
import io.github.repir.Retriever.DocumentComparatorScore;
import io.github.repir.Retriever.ReportedFeature;
import io.github.repir.Strategy.GraphComponent.ANNOUNCEKEY;
import io.github.repir.Strategy.Operator.Operator;
import io.github.repir.Strategy.RetrievalModel;
import io.github.repir.Strategy.ScoreFunction.ScoreFunction;
import io.github.repir.Strategy.Strategy;
import io.github.repir.tools.io.EOCException;
import io.github.repir.tools.lib.Log;
import io.github.repir.tools.io.struct.StructureReader;
import io.github.repir.tools.io.struct.StructureWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeSet;

/**
 * Default collector used by {@link RetrievalModel} to retrieve a ranked list of documents. 
 * Each document is scored by the documentPrior + sum of Scorable features. 
 * The {@link Query} specifies the maximum number of documents retrieved, which can
 * usually be configured as "retriever.documentlimit". 
 * 

* @author jeroen */ public class CollectorDocument extends Collector { public static Log log = new Log(CollectorDocument.class); private Document sortedDocs[]; private Comparator documentcomparatorscore; private Comparator documentcomparator; private TreeSet collectedDocs; private ArrayList scorables; RetrievalModel retrievalmodel; ScoreFunction scorefunction; double lowestscore = Double.MAX_VALUE; int limit; int tailsize = 0; public CollectorDocument() { super(); } public CollectorDocument(RetrievalModel rm) { super(rm); } @Override public void setStrategy(Strategy strategy) { super.setStrategy(strategy); retrievalmodel = (RetrievalModel) strategy; documentcomparator = retrievalmodel.query.getDocumentComparator(); documentcomparatorscore = new DocumentComparatorScore(); collectedDocs = new TreeSet(documentcomparatorscore); } @Override public int hashCode() { return (strategy == null)?0:strategy.query.id; } @Override public boolean equals(Object r) { if (r instanceof CollectorDocument) { CollectorDocument rr = (CollectorDocument) r; return strategy.query.id == rr.strategy.query.id; } return false; } @Override public Collection getReducerIDs() { return strategy.query.getReducerID(); } @Override public String getReducerName() { return Integer.toString(strategy.query.id); } @Override public void prepareRetrieval() { for (ReportedFeature f : retrievalmodel.getReportedUnstoredFeatures()) { f.feature.prepareRetrieval(strategy); } scorables = retrievalmodel.root.getAnnounce(ANNOUNCEKEY.SCORABLE); HashSet list = new HashSet(); for (Operator n : scorables) { while (n.parent != retrievalmodel.root) { n = (Operator) n.parent; } list.add(n); } for (Operator n : list) { n.doPrepareRetrieval(); } scorefunction = ScoreFunction.create(retrievalmodel.root); scorefunction.prepareRetrieval(retrievalmodel, retrievalmodel.root.getAnnounce(ANNOUNCEKEY.SCORABLE)); } @Override public boolean reduceInQuery() { return true; } public String getKey() { return Integer.toString(strategy.query.id); } public void setCollectedResults() { } @Override public void prepareAggregation() { super.prepareAggregation(); containedfeatures = retrievalmodel.root.containednodes; this.limit = retrievalmodel.getDocumentLimit(); } @Override public void collectDocument(Document doc) { //log.s("collectDocument"); doc.score += scorefunction.score(doc); if (doc.score != 0) { addDocumentRetrieval(doc); } //log.e("collectDocument"); } @Override public void reuse() { collectedDocs = new TreeSet(documentcomparatorscore); sortedDocs = new Document[0]; } /** * Add a document during document processing. The ReportedUnstoredFeatures * are reported. * * @param doc */ protected void addDocumentRetrieval(Document doc) { if (collectedDocs.size() >= limit) { if (doc.score > lowestscore) { collectedDocs.add(doc); if (collectedDocs.size() - tailsize >= limit) { for (int i = 0; i < tailsize; i++) collectedDocs.pollLast(); lowestscore = collectedDocs.last().score; setTailSize(); } } else if (doc.score == lowestscore) { collectedDocs.add(doc); tailsize++; } else { return; } } else { collectedDocs.add(doc); if (doc.score == lowestscore) { tailsize++; } else if (doc.score < lowestscore) { lowestscore = doc.score; tailsize = 1; } } for (ReportedFeature f : retrievalmodel.getReportedUnstoredFeatures()) { f.feature.report(doc, f.reportID); } } public void setTailSize() { tailsize = 0; Iterator iter = collectedDocs.descendingIterator(); Document d = iter.next(); while (iter.hasNext() && d.score == lowestscore) { tailsize++; d = iter.next(); } } @Override public void writeValue(StructureWriter rw) { rw.write(collectedDocs.size()); for (Document d : collectedDocs) { d.write(rw); } } @Override public void readValue(StructureReader reader) { try { int doccount = reader.readInt(); collectedDocs = null; sortedDocs = new Document[doccount]; for (int d = 0; d < doccount; d++) { Document doc = new Document(); doc.read(reader); sortedDocs[d] = doc; } } catch (EOCException ex) { log.fatalexception(ex, "read( %s )", reader); } } public Document[] getRetrievedDocs() { return sortedDocs; } @Override public void finishSegmentRetrieval() { sortedDocs = collectedDocs.toArray(new Document[collectedDocs.size()]); } @Override public void postLoadFeatures(int partition) { if (!retrievalmodel.root.needsPrePass()) { retriever.readReportedStoredFeatures(collectedDocs, retrievalmodel.getReportedStoredFeatures(), partition); if (collectedDocs.size() > limit) { TreeSet collectedDocs = new TreeSet(documentcomparator); collectedDocs.addAll(this.collectedDocs); this.collectedDocs = collectedDocs; while (collectedDocs.size() > limit) collectedDocs.pollLast(); } } } @Override public void aggregate(Collector subcollector) { CollectorDocument cd = (CollectorDocument) subcollector; if (this.sortedDocs == null || sortedDocs.length == 0) { if (cd.sortedDocs != null) { sortedDocs = cd.sortedDocs; for (Document d : sortedDocs) { d.setRetrievalModel(retrievalmodel); } } } else if (cd.sortedDocs != null && cd.sortedDocs.length > 0) { Document newlist[] = new Document[Math.min(sortedDocs.length + cd.sortedDocs.length, retrievalmodel.getDocumentLimit())]; int p2 = 0, p3 = 0; for (int p1 = 0; p1 < newlist.length; p1++) { if (documentcomparator.compare(sortedDocs[p2], cd.sortedDocs[p3]) < 0) { newlist[p1] = sortedDocs[p2]; if (++p2 == sortedDocs.length) { for (p1++; p1 < newlist.length; p1++) { newlist[p1] = cd.sortedDocs[p3++]; newlist[p1].setRetrievalModel(retrievalmodel); } } } else { newlist[p1] = cd.sortedDocs[p3]; newlist[p1].setRetrievalModel(retrievalmodel); if (++p3 == cd.sortedDocs.length) { for (p1++; p1 < newlist.length; p1++) { newlist[p1] = sortedDocs[p2++]; } } } } sortedDocs = newlist; } } @Override public void decode() { for (Document doc : sortedDocs) { doc.setRetrievalModel(retrievalmodel); doc.decode(); } } @Override public void writeID(StructureWriter writer) { writer.write(strategy.query.id); } @Override public void writeKey(StructureWriter writer) { } @Override public void readKey(StructureReader reader) throws EOCException { } @Override public void readID(StructureReader reader) throws EOCException { reader.readInt(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy