// io.github.repir.Strategy.RetrievalModel Maven / Gradle / Ivy
// The newest version!
package io.github.repir.Strategy;
import io.github.repir.Repository.Feature;
import io.github.repir.Repository.ReportableFeature;
import io.github.repir.Repository.ReportedUnstoredFeature;
import java.lang.reflect.Constructor;
import io.github.repir.Retriever.Document;
import io.github.repir.Retriever.Retriever;
import io.github.repir.Retriever.PostingIterator;
import io.github.repir.Retriever.Query;
import io.github.repir.tools.lib.Log;
import io.github.repir.Repository.StoredFeature;
import io.github.repir.Repository.StoredReportableFeature;
import io.github.repir.Retriever.ReportedFeature;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import io.github.repir.Strategy.Collector.Collector;
import io.github.repir.Strategy.Collector.CollectorDocument;
import io.github.repir.tools.lib.ClassTools;
/**
* A RetrievalModel is a Strategy that eventually returns the query with a list
* of ranked {@link Document}s.
*
* Similar to a {@link Strategy}, the {@link RetrievalModel} contains the Query
* independent logic to control retrieval, e.g. the basic operation for pseudo
* relevance feedback or Sequential Dependence Model. During retrieval, the
* retrieval model instance controls the retrieval operation for a single pass
* on a partition for a single query. All used {@link Feature}s are managed by
* the {@link RetrievalModel} for a pass; if {@link Operator}s need a
* {@link StoredFeature} they should request this with
* {@link RetrievalModel#requestFeature(java.lang.Class, java.lang.String[])}
* and the reported features that are specified in the {@link Query} are
* instantiated by the RetrievalModel and can be obtained with f.i.
* {@link #getReportableFeatures()}.
*
* To allow for multi-pass retrieval, a {@link Retriever} will receive the
* resulting query after each pass and retrieve again until the Query contains
* results. A RetrievalModel distinguishes itself from a {@link Strategy} by
* constructing a graph of {@link Operator} nodes that are used to process each
* {@link Document}. The nodes should request the features they need using
* {@link #requestFeature(java.lang.Class, java.lang.String[])}
* and by default the RetrievalModel will internally use a
* {@link PostingIterator} to efficiently retrieve the feature values for each
* {@link Document}. On the final pass, a RetrievalModel will use a
* {@link DocumentCollector} to collect a ranked list of
* "retriever.documentlimit" {@link Document}s. Retrieval for a partition is
* executed by running {@link #doMapTask()}, in which the RetrievalModel
* iterates over the retrieved {@link Document}s, calls the highest operator
* nodes in the graph to have all operators set their values corresponding to the
* current document, and calls for the {@link Collector} to
* {@link Collector#collectDocument(io.github.repir.Retriever.Document)}
* the document. The default {@link DocumentCollector} will use the configured
* "retriever.scorefunction" to assign a score to the document, and maintain a
* list of the top ranked documents.
*
* used {@link Feature}s.
*
* @author jeroen
*/
public class RetrievalModel extends Strategy {
public static Log log = new Log(RetrievalModel.class);
public ArrayList requestedfeatures = new ArrayList();
public GraphRoot root;
private Class documentcollectorclass = CollectorDocument.class;
// requested features
public HashMap featuresmap;
private ArrayList> reportableunstoredfeatures;
private ArrayList> storedreportablefeatures;
private ArrayList> reportablefeatures;
/**
 * Creates a retrieval model by handing the retriever to the {@link Strategy}
 * constructor. Use {@link #create(Retriever, Query)} instead of calling this
 * constructor directly.
 *
 * @param retriever the retriever passed on to the superclass constructor
 */
public RetrievalModel(Retriever retriever) {
super(retriever);
}
/**
 * Factory method for a RetrievalModel. Delegates to the three-argument
 * {@code create(retriever, queryrequest, RetrievalModel.class)}, which is not
 * defined in this class (presumably inherited from {@link Strategy} -- confirm),
 * and casts the result.
 *
 * @param retriever passed through to the delegate
 * @param queryrequest the query passed through to the delegate
 * @return the delegate's result, cast to {@link RetrievalModel}
 */
public static RetrievalModel create(Retriever retriever, Query queryrequest) {
return (RetrievalModel) create(retriever, queryrequest, RetrievalModel.class);
}
/**
 * Replaces {@link #root} with a new {@link GraphRoot} created for this model
 * and asks that root to build its graph. Declared final, so subclasses cannot
 * override it.
 */
public final void buildGraph() {
// root is assigned before buildGraph() runs on it
root = new GraphRoot(this);
root.buildGraph();
}
/**
 * Registers a {@link StoredFeature} as used by this retrieval model. An
 * {@link Operator} that needs a stored feature to obtain data for the inspected
 * {@link Document}s should register it here. A feature that
 * {@link #requestedfeatures} already contains is not added a second time.
 * <p>
 * The registered features are exposed through {@link #getUsedFeatures()}.
 * NOTE(review): the earlier documentation stated that requested features are
 * automatically added to the {@link PostingIterator}; that wiring is not visible
 * in this class.
 *
 * @param feature the stored feature to register
 */
public void requestFeature(StoredFeature feature) {
   if (requestedfeatures.contains(feature)) {
      return;
   }
   requestedfeatures.add(feature);
}
/**
 * @return all {@link StoredFeature}s registered through
 * {@link #requestFeature(StoredFeature)}. This is the live
 * {@link #requestedfeatures} list itself, not a copy.
 */
public Collection<StoredFeature> getUsedFeatures() {
   return requestedfeatures;
}
/**
 * Sets up the collector for a pass that does not need a pre-pass. By default
 * the collector class is {@link CollectorDocument}, which is used to obtain a
 * ranked list of {@link Document}s.
 * <p>
 * When the graph root reports that a pre-pass is needed, nothing is constructed
 * here. Per the earlier documentation, the {@link Operator} that flagged the
 * pre-pass then supplies its own {@link Collector} and no end result is obtained
 * during that pass.
 * <p>
 * Otherwise an instance of the configured collector class is constructed
 * through {@link ClassTools} with this model as constructor argument. The
 * created object is not kept by this method. NOTE(review): presumably the
 * collector registers itself with the model from its constructor -- confirm.
 */
@Override
public void setCollector() {
   if (root.needsPrePass()) {
      return;
   }
   Constructor collectorConstructor =
         ClassTools.getConstructor(documentcollectorclass, RetrievalModel.class);
   ClassTools.construct(collectorConstructor, this);
}
/**
 * Sets up the Strategy so that results can be collected and aggregated but are
 * not retrieved yet: the operator graph is built with {@link #buildGraph()} and
 * {@code prepareRetrieval()} is then called on the resulting root. This is
 * typically used in the Reducer to create a Strategy for the aggregation of
 * results collected per segment.
 */
@Override
public void prepareAggregationDetail() {
buildGraph();
root.prepareRetrieval();
}
/**
 * Retrieves and processes the results for a single partition. This is
 * typically used in Mappers that only process the results for a single
 * partition and send the results to the reducer.
 * <p>
 * Does nothing when the graph root contains no nodes. Otherwise a
 * {@link PostingIterator} is obtained from the retriever for this model and its
 * partition, and every {@link Document} it yields is handed to the collectors
 * until the iterator returns {@code null}.
 */
@Override
public void doMapTask() {
   if (root.containednodes.size() <= 0) {
      return;
   }
   PostingIterator postings = retriever.getPostingIterator(this, partition);
   Document current = postings.next();
   while (current != null) {
      collectors.collect(current);
      current = postings.next();
   }
}
/**
 * Decides after a retrieval pass whether this was the final pass, in which case
 * results are returned, or whether a consecutive retrieval pass is required.
 * <ul>
 * <li>If the graph root needs a pre-pass, the query text is replaced by the
 * root's reformulation and the strategy class name is set to this class, so a
 * consecutive pass follows.</li>
 * <li>Otherwise the collector registered under the simple name of the document
 * collector class is looked up. If it exists, the strategy class name is cleared
 * and, when there is at least one collector, its retrieved documents are added
 * to the query. If it does not exist, the query is returned unchanged.</li>
 * </ul>
 *
 * @return the {@link Query} object, updated as described above
 */
public Query finishReduceTask() {
   if (root.needsPrePass()) {
      query.query = root.postReform();
      query.setStrategyClassname(RetrievalModel.class.getCanonicalName());
      return query;
   }

   Collector documentCollector = collectors.getCollector(documentcollectorclass.getSimpleName());
   if (documentCollector == null) {
      return query;
   }

   query.setStrategyClassname(null);
   if (collectors.size() > 0) {
      CollectorDocument ranked = (CollectorDocument) documentCollector;
      for (Document retrieved : ranked.getRetrievedDocs()) {
         query.add(retrieved);
      }
   }
   return query;
}
/**
 * Replaces {@code query.query} with the result of {@code root.postReform()} and
 * returns the query. This can be used by multi-pass retrieval models such as
 * Pseudo Relevance Feedback to reformulate a Query based on retrieved results.
 *
 * @return the {@link Query} of this model, carrying the reformulated query
 */
public Query postReform() {
query.query = root.postReform();
return query;
}
/**
 * Low level factory for a new {@link Document}; the work is delegated to
 * {@code query.createDocument(this, id, partition)}.
 * <p>
 * Any exception thrown by that call is reported through
 * {@code log.fatalexception} and {@code null} is returned instead.
 *
 * @param id document id passed through to the query
 * @param partition partition number passed through to the query
 * @return the created Document, or {@code null} if creation threw an exception
 */
public Document createDocument(int id, int partition) {
   Document created = null;
   try {
      created = query.createDocument(this, id, partition);
   } catch (Exception ex) {
      log.fatalexception(ex, "createDocument( %d, %d )", id, partition);
   }
   return created;
}
/**
 * Returns the map of all {@link ReportedFeature}s named by
 * {@link #getReportedFeatures()}, keyed by feature name. The map is built on the
 * first call and cached in {@link #featuresmap}.
 * <p>
 * Each feature is looked up in the repository by name and cast to
 * {@link ReportableFeature}. Its {@code reportID} is set to the number of
 * entries the map holds at the moment the feature is added.
 *
 * @return the cached map from feature name to {@link ReportedFeature}
 */
public HashMap<String, ReportedFeature> getReportedFeaturesMap() {
   if (featuresmap == null) {
      // assigned before getReportedFeatures() is invoked, as in the original order
      featuresmap = new HashMap<String, ReportedFeature>();
      for (String featurename : getReportedFeatures()) {
         ReportableFeature feature = (ReportableFeature) repository.getFeature(featurename);
         ReportedFeature reported = new ReportedFeature(featurename, feature);
         reported.reportID = featuresmap.size();
         featuresmap.put(featurename, reported);
      }
   }
   return featuresmap;
}
/**
 * Returns the names of the {@link Feature}s that were specified in the
 * {@link Query}, to report back for each {@link Document}. This method is
 * overridden by retrieval models that implement multi-pass retrieval, to prevent
 * fetching features that are only needed in the final pass and alternatively to
 * fetch features needed after the pre-pass, e.g. Pseudo Relevance Feedback
 * models, which sometimes need the size of the documents.
 *
 * @return {@code query.reportedFeatures}, returned as-is (not a copy)
 */
public ArrayList<String> getReportedFeatures() {
   return query.reportedFeatures;
}
/**
* @return a Collection of {@link ReportedFeature}s.
*/
public Collection> getReportableFeatures() {
if (reportablefeatures == null) {
reportablefeatures = new ArrayList>();
for (ReportedFeature f : getReportedFeaturesMap().values()) {
reportablefeatures.add(new ReportedFeature(f));
}
}
return reportablefeatures;
}
/**
 * Looks up the report position of a feature, which is used internally to assign
 * the feature's data to a slot in each {@link Document}'s reporteddata array.
 * The lookup uses the feature's canonical name as key into
 * {@link #getReportedFeaturesMap()}.
 *
 * @param f the feature whose report position is wanted
 * @return the {@code reportID} of the matching {@link ReportedFeature}, or -1
 * when the map has no entry for the feature's canonical name
 */
public int getReportID(ReportableFeature f) {
   ReportedFeature reported = getReportedFeaturesMap().get(f.getCanonicalName());
   if (reported != null) {
      return reported.reportID;
   }
   return -1;
}
/**
* @return Collection of {@link ReportedFeature}s that are stored, i.e. these
* are not needed to score {@link Document}s, but only have to be retrieved
* for the documents in the final ranked list.
*/
public Collection> getReportedStoredFeatures() {
if (storedreportablefeatures == null) {
storedreportablefeatures = new ArrayList>();
for (ReportedFeature f : getReportedFeaturesMap().values()) {
if (f.feature instanceof StoredReportableFeature) {
storedreportablefeatures.add(new ReportedFeature(f));
}
}
}
return storedreportablefeatures;
}
/**
* @return Collection of {@link ReportedFeature}s that are not stored, but
* created during processing of the graph for each document, and therefore have
* to be stored during graph processing. A typical example is a secondary
* {@link ScoreFunction}, which is calculated but not stored as the primary score
* used to rank the documents.
*/
public Collection> getReportedUnstoredFeatures() {
if (reportableunstoredfeatures == null) {
reportableunstoredfeatures = new ArrayList>();
for (ReportedFeature f : getReportedFeaturesMap().values()) {
if (f.feature instanceof ReportedUnstoredFeature) {
reportableunstoredfeatures.add(new ReportedFeature(f));
}
}
}
return reportableunstoredfeatures;
}
/**
 * Looks up a reported feature by feature class and parameters. The key into
 * {@link #getReportedFeaturesMap()} is the name produced by
 * {@code Feature.canonicalName(c, parameters)}.
 *
 * @param c the feature class
 * @param parameters optional parameters that are part of the canonical name
 * @return the matching {@link ReportedFeature}, or {@code null} when the map has
 * no entry under that name
 */
public ReportedFeature getReportedFeature(Class c, String... parameters) {
   return getReportedFeaturesMap().get(Feature.canonicalName(c, parameters));
}
/**
 * Looks up the reported feature for the given feature, using the feature's
 * canonical name as key into {@link #getReportedFeaturesMap()}.
 *
 * @param f the feature to look up
 * @return the matching {@link ReportedFeature}, or {@code null} when the map has
 * no entry under that name
 */
public ReportedFeature getReportedFeature(ReportableFeature f) {
   return getReportedFeaturesMap().get(f.getCanonicalName());
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy