io.github.repir.Retriever.MapReduce.Retriever Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of RepIR Show documentation
The newest version!
package io.github.repir.Retriever.MapReduce;

import io.github.repir.Repository.Repository;
import io.github.repir.Retriever.Query;
import io.github.repir.tools.lib.Log;
import io.github.repir.Strategy.Strategy;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import io.github.repir.Strategy.Operator.Analyzer;
import io.github.repir.Strategy.RetrievalModelAnalyze;

/**
 * An implementation of Retriever that retrieves queries using the MapReduce
 * framework. After each pass, queries that are complete (i.e. there is no
 * consecutive Strategy to run) are stored in the final results, and queries
 * that require an additional retrieval pass are automatically resubmitted to
 * the MapReduce framework. From the user's perspective, a Query is given, and
 * the end result of the final run per Query is returned.
 *
 * @author jeroen
 */
public class Retriever extends io.github.repir.Retriever.Retriever {

   public static Log log = new Log(Retriever.class);
   /** Minimum time in milliseconds between two progress reports (5 minutes). */
   private static final long PROGRESS_INTERVAL_MS = 300000;
   /** Mapper context used by {@link #mapperProgress()}; null unless passed to the constructor. */
   protected Mapper.Context mappercontext;
   /** Reducer context used by {@link #reducerProgress()}; null unless passed to the constructor. */
   protected Reducer.Context reducercontext;
   /** Time (System.currentTimeMillis) of the last progress report, shared by mapper and reducer reporting. */
   private long lastprogress;
   /** Not referenced inside this class; kept for subclasses. */
   protected RetrieverJob currentjob;
   /** Strategies of the queries that still need a retrieval pass in {@link #retrieveQueries(ArrayList)}. */
   protected ArrayList<Strategy> models;
   /** Queries that reported {@code done()} during {@link #retrieveQueries(ArrayList)}. */
   protected ArrayList<Query> finalresults;
   /** Not referenced inside this class; kept for subclasses. */
   protected String jobpath;

   /**
    * Creates a Retriever without a Mapper or Reducer context, so the progress
    * methods do nothing.
    *
    * @param repository the repository passed on to the superclass
    */
   public Retriever(Repository repository) {
      super(repository);
   }

   /**
    * The Mapper context is used to report progress, to prevent processes from
    * being killed while still working.
    *
    * @param repository the repository passed on to the superclass
    * @param mappercontext context that receives the progress reports
    */
   public Retriever(Repository repository, org.apache.hadoop.mapreduce.Mapper.Context mappercontext) {
      super(repository);
      this.mappercontext = mappercontext;
   }

   /**
    * The Reducer context is used to report progress, to prevent processes from
    * being killed while still aggregating.
    *
    * @param repository the repository passed on to the superclass
    * @param reducercontext context that receives the progress reports
    */
   public Retriever(Repository repository, org.apache.hadoop.mapreduce.Reducer.Context reducercontext) {
      super(repository);
      this.reducercontext = reducercontext;
   }

   /**
    * Creates the job used to run a retrieval pass. Subclasses can override this
    * to supply a different job.
    *
    * @param path passed unchanged to the RetrieverJob constructor
    * @return a new RetrieverJob bound to this Retriever
    * @throws IOException if the job cannot be constructed
    */
   public RetrieverJob createJob(String path) throws IOException {
      return new RetrieverJob(this, path);
   }

   /**
    * The List of queries are retrieved using the MapReduce framework. Passes
    * are repeated until no Strategy remains whose query is not done. An
    * IOException is logged and ends the loop, in which case only the queries
    * completed so far are returned.
    *
    * @param queries the queries to retrieve
    * @return the queries that were completed
    */
   @Override
   public ArrayList<Query> retrieveQueries(ArrayList<Query> queries) {
      models = new ArrayList<Strategy>();
      finalresults = new ArrayList<Query>();
      String path = null;
      try {
         for (Query q : queries) {
            models.add(Strategy.create(this, q));
         }
         removeDoneQueries();
         while (models.size() > 0) {
            RetrieverJob job = createJob(path);
            // every following pass is created with the path of the previous job
            path = job.path;
            addQueriesToJob(job, models);
            // NOTE(review): presumably makes cached feature data available to the job - confirm
            repository.featuresWriteCache();
            Collection<Query> results = job.getResults();
            models = resultToModel(results, models);
            removeDoneQueries();
            if (models.size() > 0) {
               // another pass follows: reread the configuration first
               repository.readConfiguration();
            }
         }
      } catch (IOException ex) {
         log.exception(ex, "retrieveQueries( %s )", queries);
      }
      return finalresults;
   }

   /**
    * Collects the results of the given queries through
    * {@link RetrieverJob#recoverResults()} instead of running a new pass. This
    * method works on local lists and leaves the {@code models} and
    * {@code finalresults} fields untouched. If some queries are still not done
    * afterwards, this is logged as fatal.
    *
    * @param queries the queries whose results should be recovered
    * @param path passed to {@link #createJob(String)}
    * @return the queries that are done after recovery
    */
   public ArrayList<Query> recoverQueries(ArrayList<Query> queries, String path) {
      ArrayList<Strategy> models = new ArrayList<Strategy>();
      ArrayList<Query> finalresults = new ArrayList<Query>();
      try {
         for (Query q : queries) {
            models.add(Strategy.create(this, q));
         }
         if (models.size() > 0) {
            RetrieverJob job = createJob(path);
            addQueriesToJob(job, models);
            Collection<Query> results = job.recoverResults();
            models = resultToModel(results, models);
         }
         transferDoneQueries(models, finalresults);
         if (models.size() > 0) {
            log.fatal("not all results were collected");
         }
      } catch (IOException ex) {
         log.exception(ex, "recoverQueries( %s )", queries);
      }
      return finalresults;
   }

   /**
    * Hands the queries of the given strategies to the job.
    *
    * @param job the job that receives the queries
    * @param models the strategies whose queries are submitted
    */
   protected void addQueriesToJob(RetrieverJob job, ArrayList<Strategy> models) {
      ArrayList<Query> queries = new ArrayList<Query>();
      for (Strategy rm : models) {
         queries.add(rm.query);
      }
      job.setQueries(queries);
   }

   /**
    * Builds the list of strategies for the next pass from the results of the
    * last pass. For a strategy that has a result with the same query id, the
    * result replaces its query; when that result is done the strategy itself
    * is kept, otherwise a new Strategy is created for the result. A strategy
    * without a result is kept as is, unless it is an Analyzer or a
    * RetrievalModelAnalyze, in which case it is dropped.
    *
    * @param results the queries returned by the last pass
    * @param models the strategies that were submitted in the last pass
    * @return the strategies to consider after this pass
    */
   protected ArrayList<Strategy> resultToModel(Collection<Query> results, ArrayList<Strategy> models) {
      ArrayList<Strategy> newmodels = new ArrayList<Strategy>();
      for (Strategy rm : models) {
         Query result = findResultFor(rm, results);
         if (result != null) {
            rm.query = result;
            if (result.done()) {
               newmodels.add(rm);
            } else {
               newmodels.add(Strategy.create(this, result));
            }
         } else if (!(rm instanceof Analyzer) && !(rm instanceof RetrievalModelAnalyze)) {
            newmodels.add(rm);
         }
      }
      return newmodels;
   }

   /**
    * Returns the first result whose id equals the id of the strategy's query,
    * or null if there is none.
    */
   private static Query findResultFor(Strategy rm, Collection<Query> results) {
      for (Query q : results) {
         // NOTE(review): ids are compared with ==; confirm Query.id is a primitive
         if (rm.query.id == q.id) {
            return q;
         }
      }
      return null;
   }

   /**
    * Moves every query that is done from the strategies to the list of
    * completed queries, removing its strategy from models.
    */
   private static void transferDoneQueries(ArrayList<Strategy> models, ArrayList<Query> done) {
      Iterator<Strategy> iter = models.iterator();
      while (iter.hasNext()) {
         Strategy rm = iter.next();
         if (rm.query.done()) {
            done.add(rm.query);
            iter.remove();
         }
      }
   }

   /** Returns a new, plain ArrayList that contains only the given query. */
   private static ArrayList<Query> listOf(Query q) {
      ArrayList<Query> queries = new ArrayList<Query>();
      queries.add(q);
      return queries;
   }

   /**
    * For the retrieval of large results sets that may not fit into memory, this
    * retrieves a list of queries, and returns an {@link QueueIterator} that
    * allows to read the returned results as a stream. Each value of the stream
    * returns a QueryIterator.
    *
    * @param queries the queries to retrieve
    * @return the iterator over the results, or null when the list of queries
    * is empty or an IOException occurred (the exception is logged)
    */
   public QueueIterator retrieveQueueIterator(ArrayList<Query> queries) {
      QueueIterator fqi = null;
      try {
         if (queries.size() > 0) {
            repository.readConfiguration(); // have to reread in case values were added to the repository
            RetrieverJob job = createJob(null);
            job.setQueries(queries);
            fqi = job.getQueueIterator();
         }
      } catch (IOException ex) {
         log.exception(ex, "retrieveQueueIterator( %s )", queries);
      }
      return fqi;
   }

   /**
    * Runs a job for the given queries, passing {@code false} as the wait flag
    * to {@link RetrieverJob#doJob}.
    *
    * @param queries the queries to submit
    */
   public void doJobDontWait(ArrayList<Query> queries) {
      doJob(queries, false);
   }

   /**
    * Runs a job for a single query, passing {@code false} as the wait flag to
    * {@link RetrieverJob#doJob}.
    *
    * @param q the query to submit
    */
   public void doJobDontWait(final Query q) {
      doJobDontWait(listOf(q));
   }

   /**
    * Runs a job for the given queries, passing {@code true} as the wait flag
    * to {@link RetrieverJob#doJob}.
    *
    * @param queries the queries to submit
    */
   public void doJob(ArrayList<Query> queries) {
      doJob(queries, true);
   }

   /**
    * Runs a job for a single query, passing {@code true} as the wait flag to
    * {@link RetrieverJob#doJob}.
    *
    * @param q the query to submit
    */
   public void doJob(final Query q) {
      doJob(listOf(q));
   }

   /**
    * Rereads the repository configuration and runs a job, created with an
    * empty path, for the given queries. An empty list is only logged; an
    * IOException is logged and not propagated.
    *
    * @param queries the queries to submit
    * @param wait passed unchanged to {@link RetrieverJob#doJob}
    */
   private void doJob(ArrayList<Query> queries, boolean wait) {
      try {
         if (queries.size() > 0) {
            repository.readConfiguration(); // have to reread in case values were added to the repository
            RetrieverJob job = createJob("");
            job.setQueries(queries);
            job.doJob(wait);
         } else {
            log.info("no queries");
         }
      } catch (IOException ex) {
         log.exception(ex, "doJob( %s )", queries);
      }
   }

   /**
    * For the retrieval of large results sets that may not fit into memory, this
    * retrieves the queries in the queue, and returns an {@link QueueIterator}
    * that allows to read the returned results as a stream. Each value of the
    * stream returns a QueryIterator.
    *
    * @return An Iterator that allows to read the results as a stream.
    */
   public QueueIterator retrieveQueueIterator() {
      return this.retrieveQueueIterator(getQueue());
   }

   /**
    * Reports progress to the Mapper context, if one was supplied, but only
    * when more than {@link #PROGRESS_INTERVAL_MS} has passed since the last
    * report made by either progress method.
    */
   @Override
   public void mapperProgress() {
      if (mappercontext != null && System.currentTimeMillis() - lastprogress > PROGRESS_INTERVAL_MS) {
         mappercontext.progress();
         lastprogress = System.currentTimeMillis();
      }
   }

   /**
    * Reports progress to the Reducer context, if one was supplied, but only
    * when more than {@link #PROGRESS_INTERVAL_MS} has passed since the last
    * report made by either progress method.
    */
   @Override
   public void reducerProgress() {
      if (reducercontext != null && System.currentTimeMillis() - lastprogress > PROGRESS_INTERVAL_MS) {
         reducercontext.progress();
         lastprogress = System.currentTimeMillis();
      }
   }

   /**
    * Retrieves a single query described in the Query object using the MapReduce
    * framework.
    *
    * @param q the query that represents the user's need
    * @return the first completed query returned by
    * {@link #retrieveQueries(ArrayList)}; when no query was completed (for
    * instance after a logged IOException) the passed query is returned as is
    */
   @Override
   public Query retrieveQuery(Query q) { // implements physical retrieval strategy
      ArrayList<Query> results = retrieveQueries(listOf(q));
      // retrieveQueries can come back empty; avoid an IndexOutOfBoundsException
      if (results.isEmpty()) {
         return q;
      }
      return results.get(0);
   }

   /**
    * Moves the queries that are done from {@code models} to
    * {@code finalresults}.
    */
   protected void removeDoneQueries() {
      transferDoneQueries(models, finalresults);
   }
}