package io.github.repir.Retriever.MapReduce;

import io.github.repir.Repository.Repository;
import io.github.repir.Retriever.Query;
import io.github.repir.tools.lib.Log;
import io.github.repir.Strategy.Strategy;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import io.github.repir.Strategy.Operator.Analyzer;
import io.github.repir.Strategy.RetrievalModelAnalyze;

/**
 * An implementation of Retriever that retrieves queries using the MapReduce
 * framework. After each pass, queries that are complete (i.e. there is no
 * subsequent Strategy to run) are stored in finalresults, and queries that
 * require an additional retrieval pass are automatically resubmitted to the
 * MapReduce framework. From the user's perspective, a Query is given, and the
 * end result of the final run per Query is returned.
 *
 * @author jeroen
 */
public class Retriever extends io.github.repir.Retriever.Retriever {

   public static Log log = new Log(Retriever.class);
   protected Mapper.Context mappercontext;
   protected Reducer.Context reducercontext;
   private long lastprogress;
   protected RetrieverJob currentjob;
   protected ArrayList<Strategy> models;
   protected ArrayList<Query> finalresults;
   protected String jobpath;

   public Retriever(Repository repository) {
      super(repository);
   }

   /**
    * The Mapper context is used to report progress, to prevent processes from
    * being killed while still working.
    *
    * @param repository
    * @param mappercontext
    */
   public Retriever(Repository repository, org.apache.hadoop.mapreduce.Mapper.Context mappercontext) {
      super(repository);
      this.mappercontext = mappercontext;
   }

   /**
    * The Reducer context is used to report progress, to prevent processes from
    * being killed while still aggregating.
    *
    * @param repository
    * @param reducercontext
    */
   public Retriever(Repository repository, org.apache.hadoop.mapreduce.Reducer.Context reducercontext) {
      super(repository);
      this.reducercontext = reducercontext;
   }

   public RetrieverJob createJob(String path) throws IOException {
      return new RetrieverJob(this, path);
   }

   /**
    * The list of queries is retrieved using the MapReduce framework.
    *
    * @param queries
    * @return retrieved list of queries
    */
   @Override
   public ArrayList<Query> retrieveQueries(ArrayList<Query> queries) {
      models = new ArrayList<Strategy>();
      finalresults = new ArrayList<Query>();
      String path = null;
      try {
         for (Query q : queries) {
            models.add(Strategy.create(this, q));
         }
         removeDoneQueries();
         // rerun MapReduce jobs until every query's Strategy chain has completed
         while (models.size() > 0) {
            RetrieverJob job = createJob(path);
            path = job.path;
            addQueriesToJob(job, models);
            repository.featuresWriteCache();
            Collection<Query> results = job.getResults();
            models = resultToModel(results, models);
            removeDoneQueries();
            if (models.size() > 0) {
               repository.readConfiguration();
            }
         }
      } catch (IOException ex) {
         log.exception(ex, "retrieveQueries( %s )", queries);
      }
      return finalresults;
   }

   public ArrayList<Query> recoverQueries(ArrayList<Query> queries, String path) {
      ArrayList<Strategy> models = new ArrayList<Strategy>();
      ArrayList<Query> finalresults = new ArrayList<Query>();
      try {
         for (Query q : queries) {
            models.add(Strategy.create(this, q));
         }
         if (models.size() > 0) {
            RetrieverJob job = createJob(path);
            addQueriesToJob(job, models);
            Collection<Query> results = job.recoverResults();
            models = resultToModel(results, models);
         }
         Iterator<Strategy> iter = models.iterator();
         while (iter.hasNext()) {
            Strategy rm = iter.next();
            if (rm.query.done()) {
               finalresults.add(rm.query);
               iter.remove();
            }
         }
         if (models.size() > 0) {
            log.fatal("not all results were collected");
         }
      } catch (IOException ex) {
         log.exception(ex, "retrieveQueries( %s )", queries);
      }
      return finalresults;
   }

   protected void addQueriesToJob(RetrieverJob job, ArrayList<Strategy> models) {
      ArrayList<Query> queries = new ArrayList<Query>();
      for (Strategy rm : models) {
         queries.add(rm.query);
      }
      job.setQueries(queries);
   }

   protected ArrayList<Strategy> resultToModel(Collection<Query> results, ArrayList<Strategy> models) {
      ArrayList<Strategy> newmodels = new ArrayList<Strategy>();
      // match retrieved queries back to their strategies; queries that are not
      // done yet get a fresh Strategy for the next pass
      NEXT:
      for (Strategy rm : models) {
         for (Query q : results) {
            if (rm.query.id == q.id) {
               rm.query = q;
               if (q.done()) {
                  newmodels.add(rm);
               } else {
                  newmodels.add(Strategy.create(this, q));
               }
               continue NEXT;
            }
         }
         if (!(rm instanceof Analyzer) && !(rm instanceof RetrievalModelAnalyze)) {
            newmodels.add(rm);
         }
      }
      return newmodels;
   }

   /**
    * For the retrieval of large result sets that may not fit into memory, this
    * retrieves a list of queries and returns a {@link QueueIterator} that
    * allows the returned results to be read as a stream. Each value of the
    * stream returns a QueryIterator.
    *
    * @param queries
    * @return
    */
   public QueueIterator retrieveQueueIterator(ArrayList<Query> queries) {
      QueueIterator fqi = null;
      String path = null;
      try {
         ArrayList<Query> finalresults = new ArrayList<Query>();
         ArrayList<Query> results;
         if (queries.size() > 0) {
            repository.readConfiguration(); // have to reread in case values were added to the repository
            RetrieverJob job = createJob(path);
            path = job.path;
            job.setQueries(queries);
            queries = new ArrayList<Query>();
            fqi = job.getQueueIterator();
         }
      } catch (IOException ex) {
         log.exception(ex, "retrieveQueueIterator( %s )", queries);
      }
      return fqi;
   }

   public void doJobDontWait(ArrayList<Query> queries) {
      doJob(queries, false);
   }

   public void doJobDontWait(final Query q) {
      doJobDontWait(new ArrayList<Query>() {{ add(q); }});
   }

   public void doJob(ArrayList<Query> queries) {
      doJob(queries, true);
   }

   public void doJob(final Query q) {
      doJob(new ArrayList<Query>() {{ add(q); }});
   }

   private void doJob(ArrayList<Query> queries, boolean wait) {
      try {
         if (queries.size() > 0) {
            repository.readConfiguration(); // have to reread in case values were added to the repository
            RetrieverJob job = createJob("");
            job.setQueries(queries);
            job.doJob(wait);
         } else {
            log.info("no queries");
         }
      } catch (IOException ex) {
         log.exception(ex, "doJob( %s )", queries);
      }
   }

   /**
    * For the retrieval of large result sets that may not fit into memory, this
    * retrieves the queries in the queue and returns a {@link QueueIterator}
    * that allows the returned results to be read as a stream. Each value of
    * the stream returns a QueryIterator.
    *
    * @return An Iterator that allows the results to be read as a stream.
    */
   public QueueIterator retrieveQueueIterator() {
      return this.retrieveQueueIterator(getQueue());
   }

   @Override
   public void mapperProgress() {
      // report progress at most once every 5 minutes (300000 ms)
      if (mappercontext != null && System.currentTimeMillis() - lastprogress > 300000) {
         mappercontext.progress();
         lastprogress = System.currentTimeMillis();
      }
   }

   @Override
   public void reducerProgress() {
      if (reducercontext != null && System.currentTimeMillis() - lastprogress > 300000) {
         reducercontext.progress();
         lastprogress = System.currentTimeMillis();
      }
   }

   /**
    * Retrieves a single query described in the Query object using the MapReduce
    * framework.
    *
    * @param q the Query that represents the user's need
    * @return the passed Query object, expanded with the retrieved documents
    */
   @Override
   public Query retrieveQuery(Query q) {
      // implements physical retrieval strategy
      ArrayList<Query> queries = new ArrayList<Query>();
      queries.add(q);
      ArrayList<Query> results = retrieveQueries(queries);
      return results.get(0);
   }

   protected void removeDoneQueries() {
      Iterator<Strategy> iter = models.iterator();
      while (iter.hasNext()) {
         Strategy rm = iter.next();
         if (rm.query.done()) {
            finalresults.add(rm.query);
            iter.remove();
         }
      }
   }
}

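Below is a minimal usage sketch, not part of the original source. It assumes a Repository can be opened from a configuration file name and that a Query can be constructed with an id and a query string; only the Retriever constructor, retrieveQuery and retrieveQueueIterator calls are taken from the class above, the rest are illustrative assumptions.

package io.github.repir.example; // hypothetical package, for illustration only

import io.github.repir.Repository.Repository;
import io.github.repir.Retriever.Query;
import io.github.repir.Retriever.MapReduce.QueueIterator; // assumed to live in this package
import io.github.repir.Retriever.MapReduce.Retriever;
import java.util.ArrayList;

public class RetrieveExample {

   public static void main(String[] args) throws Exception {
      // Assumption: a Repository is opened from a configuration file passed on the command line.
      Repository repository = new Repository(args[0]);
      Retriever retriever = new Retriever(repository);

      // Assumption: Query offers a constructor taking the retriever, a numeric id and the
      // query string; substitute the project's actual Query factory if it differs.
      Query query = new Query(retriever, 1, "information retrieval");

      // Single query: retrieveQuery() wraps the query in a list, runs as many MapReduce
      // passes as the Strategy requires, and returns the Query expanded with results.
      Query result = retriever.retrieveQuery(query);

      // Large result sets: retrieveQueueIterator() streams results instead of collecting
      // them in memory; each value of the stream is a QueryIterator.
      ArrayList<Query> queries = new ArrayList<Query>();
      queries.add(query);
      QueueIterator stream = retriever.retrieveQueueIterator(queries);
   }
}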