/*
 * io.github.repir.Strategy.RetrievalModel Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of RepIR Show documentation
 * The newest version!
 */
package io.github.repir.Strategy;

import io.github.repir.Repository.Feature;
import io.github.repir.Repository.ReportableFeature;
import io.github.repir.Repository.ReportedUnstoredFeature;
import java.lang.reflect.Constructor;
import io.github.repir.Retriever.Document;
import io.github.repir.Retriever.Retriever;
import io.github.repir.Retriever.PostingIterator;
import io.github.repir.Retriever.Query;
import io.github.repir.tools.lib.Log;
import io.github.repir.Repository.StoredFeature;
import io.github.repir.Repository.StoredReportableFeature;
import io.github.repir.Retriever.ReportedFeature;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import io.github.repir.Strategy.Collector.Collector;
import io.github.repir.Strategy.Collector.CollectorDocument;
import io.github.repir.tools.lib.ClassTools;

/**
 * A RetrievalModel is a Strategy that eventually returns the query with a list
 * of ranked {@link Document}s.
 * 
 * Similar to a {@link Strategy}, the {@link RetrievalModel} contains the
 * query-independent logic to control retrieval, e.g. the basic operation for
 * pseudo relevance feedback or the Sequential Dependence Model. During
 * retrieval, the retrieval model instance controls the retrieval operation for
 * a single pass on a partition for a single query. All used {@link Feature}s
 * are managed by the {@link RetrievalModel} for a pass; if {@link Operator}s
 * need a {@link StoredFeature} they should request it with
 * {@link RetrievalModel#requestFeature(java.lang.Class, java.lang.String[])}.
 * The reported features that are specified in the {@link Query} are
 * instantiated by the RetrievalModel and can be obtained with, for instance,
 * {@link #getReportableFeatures()}.
 * <p>
 * To allow for multi-pass retrieval, a {@link Retriever} will receive the
 * resulting query after each pass and retrieve again until the Query contains
 * results. A RetrievalModel distinguishes itself from a {@link Strategy} by
 * constructing a graph of {@link Operator} nodes, which are used to process
 * each {@link Document}. The nodes should request the features they need using
 * {@link #requestFeature(java.lang.Class, java.lang.String[])}
 * and by default the RetrievalModel will internally use a
 * {@link PostingIterator} to efficiently retrieve the feature values for each
 * {@link Document}. On the final pass, a RetrievalModel will use a
 * {@link CollectorDocument} to collect a ranked list of
 * "retriever.documentlimit" {@link Document}s. Retrieval for a partition is
 * executed by running {@link #doMapTask()}, in which the RetrievalModel
 * iterates over the retrieved {@link Document}s, calls the highest operator
 * nodes in the graph to have all operators set their values corresponding to
 * the current document, and calls for the {@link Collector} to
 * {@link Collector#collectDocument(io.github.repir.Retriever.Document)}
 * the document. The default {@link CollectorDocument} will use the configured
 * "retriever.scorefunction" to assign a score to the document, and maintain a
 * list of the top ranked documents.
 * 

 * used {@link Feature}s.
 *
 * @author jeroen
 */
public class RetrievalModel extends Strategy {

   public static Log log = new Log(RetrievalModel.class);
   public ArrayList requestedfeatures = new ArrayList();
   public GraphRoot root;
   private Class documentcollectorclass = CollectorDocument.class;
   // requested features
   public HashMap featuresmap;
   private ArrayList> reportableunstoredfeatures;
   private ArrayList> storedreportablefeatures;
   private ArrayList> reportablefeatures;

   /**
    * Use {@link #create(Retriever.Retriever, Retriever.Query)} instead.
    * 

    * @param retriever
    */
   public RetrievalModel(Retriever retriever) {
      super(retriever);
   }

   public static RetrievalModel create(Retriever retriever, Query queryrequest) {
      return (RetrievalModel) create(retriever, queryrequest, RetrievalModel.class);
   }

   public final void buildGraph() {
      root = new GraphRoot(this);
      root.buildGraph();
   }

   /**
    * If an {@link Operator} needs a {@link StoredFeature} for operation, to retrieve
    * data for the inspected {@link Document}s, they should obtain this using this
    * method, which is then automatically added to the {@link PostingIterator}.
    * @param clazz class that extends {@link StoredFeature}
    * @param parameter optional parameter list, e.g. for a {@link Term} that requires
    * its postings list stored in {@link TermInverted}, the parameters are term and
    * channel.
    * @return The {@link StoredFeature} that was requested.
    */
   public void requestFeature(StoredFeature feature) {
      if (!requestedfeatures.contains(feature))
         requestedfeatures.add(feature);
   }

   /**
    * @return list of all {@link StoredFeature}s requested using {@link #requestFeature(java.lang.Class, java.lang.String[]) } 
    */
   public Collection getUsedFeatures() {
      return requestedfeatures;
   }

   /**
    * By default, a retrieval model uses a {@link CollectorDocument} to retrieve
    * a ranked list of {@link Document}s. RetrievalModels that require a pre-pass
    * should override this to use different collector for the retrieval model.
    * Note: this method is not called when an {@link Operator} flagged that a 
    * pre-pass is needed, in which case the {@link Operator} constructs its own
    * custom {@link Collector} to obtain the data, and no end result is obtained during 
    * that pass.
    */
   @Override
   public void setCollector() {
      if (!root.needsPrePass()) {
         Constructor constructor = ClassTools.getConstructor(documentcollectorclass, RetrievalModel.class);
         ClassTools.construct(constructor, this);
      }
   }

   /**
    * Used to setup the Strategy so that results can be collected and aggregated
    * but are not retrieved yet. This is typically used in the Reducer to create
    * a Strategy for the aggregation of results collected per segments.
    */
   @Override
   public void prepareAggregationDetail() {
      buildGraph();
      root.prepareRetrieval();
   }

   /**
    * Retrieves and processes the results for a single partition. This is
    * typically used in Mappers that only process the results for a single
    * partition and send the results to the reducer.
    * 

    * @param partition id of the partition to retrieveQueries
    * @return the MasterCollector containing the aggregated results of all
    * segments.=
    */
   @Override
   public void doMapTask() {
      if (root.containednodes.size() > 0) {
         PostingIterator pi = retriever.getPostingIterator(this, partition);
         for (Document d = pi.next(); d != null; d = pi.next()) {
            collectors.collect(d);
         }
      }
   }

   /**
    * After a retrieval-pass, the {@link Retriever} calls the results() function
    * in which the Strategy decides whether this was the final pass and results
    * are returned, or if a consecutive retrieval pass is required.
    * 

    * @return {@link Query} object, which contains strategyclass=null and
    * queryresults if the final pass was processed, or a strategyclass with a
    * reformulated query if a consecutive retrieval pass is required.
    */
   public Query finishReduceTask() {
      if (root.needsPrePass()) {
         //log.info("cascade needed prepass");
         query.query = root.postReform();
         query.setStrategyClassname(RetrievalModel.class.getCanonicalName());
      } else {
         Collector c = collectors.getCollector(documentcollectorclass.getSimpleName());
         if (c != null) {
            query.setStrategyClassname(null);
            if (collectors.size() > 0) {
                for (Document d : ((CollectorDocument) c).getRetrievedDocs())
                    query.add(d);
            }
         }
      }
      return query;
   }

   /**
    * @return reformulated {@link Query} after a retrieval pass. This can be used
    * by multi-pass retrieval models such as Pseudo Relevance Feedback to reformulate 
    * a Query based on retrieved results.
    */
   public Query postReform() {
      query.query = root.postReform();
      return query;
   }

   /**
    * low level constructor to create a new document object
    * 
    * @param terms the number of terms in the query, to initialize the arrays
    * hat contain the statistics per term.
    * @return a Document object
    */
   public Document createDocument(int id, int partition) {
      try {
         return query.createDocument(this, id, partition);
      } catch (Exception ex) {
         log.fatalexception(ex, "createDocument( %d, %d )", id, partition);
         return null;
      }
   }

   /**
    * @return a Map containing all {@link ReportedFeature}s that are specified in
    * the {@link Query}.
    */
   public HashMap getReportedFeaturesMap() {
      if (featuresmap == null) {
         featuresmap = new HashMap();
         for (String featurename : getReportedFeatures()) {
            ReportedFeature f = new ReportedFeature(featurename, (ReportableFeature) repository.getFeature(featurename));
            f.reportID = featuresmap.size();
            featuresmap.put(featurename, f);
         }
      }
      return featuresmap;
   }
   
   /**
    * @return The {@link Feature}s that were specified in the {@link Query}, to report
    * back for each {@link Document}. This method is overridden by retrieval models
    * that implement multi-pass retrieval, to prevent fetching features that are only
    * needed in the final pass and alternatively to fetch features needed after the
    * pre-pass, e.g. Pseduo Relevance Feedback models, which sometimes need the 
    * size of the document's. 
    */
   public ArrayList getReportedFeatures() {
      return query.reportedFeatures;
   }

   

   /**
    * @return a Collection of {@link ReportedFeature}s.
    */
   public Collection> getReportableFeatures() {
      if (reportablefeatures == null) {
         reportablefeatures = new ArrayList>();
         for (ReportedFeature f : getReportedFeaturesMap().values()) {
            reportablefeatures.add(new ReportedFeature(f));
         }
      }
      return reportablefeatures;
   }

   /**
    * @return The (int) position of the specified {@link ReportableFeature}, which is 
    * used internally to assign the features data to a slot in each {@link Document}s
    * reporteddata array.
    */
   public int getReportID(ReportableFeature f) {
      ReportedFeature rf = getReportedFeaturesMap().get(f.getCanonicalName());
      return (rf == null) ? -1 : rf.reportID;
   }

   /**
    * @return Collection of {@link ReportedFeature}s that are stored, i.e. these
    * are not needed to score {@link Document}s, but only have to be retrieved
    * for the documents in the final ranked list.
    */
   public Collection> getReportedStoredFeatures() {
      if (storedreportablefeatures == null) {
         storedreportablefeatures = new ArrayList>();
         for (ReportedFeature f : getReportedFeaturesMap().values()) {
            if (f.feature instanceof StoredReportableFeature) {
               storedreportablefeatures.add(new ReportedFeature(f));
            }
         }
      }
      return storedreportablefeatures;
   }

   /**
    * @return Collection of {@link ReportedFeature}s that are not stored, but 
    * created during processing of the graph for each document, and therefore have
    * to be stored during graph processing. A typical example is a secondary
    * {@link ScoreFunction}, which is calculated but not stored as the primary score
    * used to rank the documents.
    */
   public Collection> getReportedUnstoredFeatures() {
      if (reportableunstoredfeatures == null) {
         reportableunstoredfeatures = new ArrayList>();
         for (ReportedFeature f : getReportedFeaturesMap().values()) {
            if (f.feature instanceof ReportedUnstoredFeature) {
               reportableunstoredfeatures.add(new ReportedFeature(f));
            }
         }
      }
      return reportableunstoredfeatures;
   }

   public ReportedFeature getReportedFeature(Class c, String... parameters) {
      ReportedFeature rf = getReportedFeaturesMap().get(Feature.canonicalName(c, parameters));
      return (rf != null) ? rf : null;
   }

   public ReportedFeature getReportedFeature(ReportableFeature f) {
      ReportedFeature rf = getReportedFeaturesMap().get(f.getCanonicalName());
      return (rf != null) ? rf : null;
   }
}