All downloads are free. The search and download functionality uses the official Maven repository.

io.github.repir.Retriever.Retriever Maven / Gradle / Ivy

package io.github.repir.Retriever;

import io.github.repir.tools.extract.Content;
import io.github.repir.tools.extract.ExtractChannel;
import io.github.repir.tools.extract.ExtractorConf;
import io.github.repir.tools.extract.ExtractorQuery;
import io.github.repir.Repository.Repository;
import io.github.repir.Repository.ResidentFeature;
import io.github.repir.Repository.StoredReportableFeature;
import io.github.repir.Strategy.Collector.Collector;
import io.github.repir.Strategy.Collector.CollectorCachable;
import io.github.repir.Strategy.RetrievalModel;
import io.github.repir.Strategy.Strategy;
import io.github.repir.tools.collection.ArrayMap;
import io.github.repir.tools.collection.ArrayMap.Entry;
import io.github.repir.tools.lib.Log;
import io.github.repir.tools.Words.englishStemmer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Map;
import java.util.TreeSet;

/**
 * Gives access to an existing {@link Repository.Repository}. A retriever can 
 * execute a Strategy, possibly seeded by a Query with instructions on a 
 * repository. To maximize flexibility, the retrieval/analysis process has no 
 * knowledge of the features contained in the Repository, the Strategy, 
 * what the results are and how this are retrieved. The retriever rather manages
 * the retrieval process for a given Query, by instantiating the specified 
 * Strategy, and orchestrates the retrieval either standalone or over MapReduce.
 * The retriever will trigger a sequence of phases in which retrieval is performed 
 * by configurable components.
 * 

* The most common way to retrieveQueries a single Query, is to construct a * Query object using {@link #constructDefaultQuery(java.lang.String)} and call * {@link #retrieveQuery(Retriever.Query)}. The query object that is returned * contains the results. *

*/ public class Retriever { public static Log log = new Log(Retriever.class); public Repository repository; public ExtractorQuery extractor; public ArrayList queue = new ArrayList(); public static englishStemmer stemmer = englishStemmer.get(); /** * @param repository The Repository containing the location, filename, * requestedfeatures and statistics for the repository */ protected Retriever() { } /** * Setup an Retriever for a Repository. */ public Retriever(Repository repository) { this.repository = repository; } public Repository getRepository() { return repository; } final int mask4096 = (~4095) & 0x7FFFFFFF; /** * Optimize reading reported containedfeatures after retrieval. The documents * are sorted in physical order to enable sequential read. If the estimated * number of byte between the feature data is large, small random reads are * used, otherwise the file buffer is maximized as reading the whole thing is * faster than random reads. * * @param docs * @param containedfeatures */ public void readReportedStoredFeatures(Collection docs, Collection> features, int partition) { int MAXMEMORY = 100000000; TreeSet docids = new TreeSet(new Comparator() { public int compare(Document a, Document b) { return a.docid - b.docid; } }); docids.addAll(docs); int memoryleft = MAXMEMORY; ArrayMap sizes = new ArrayMap(); for (ReportedFeature f : features) { if (f.feature.partition != partition) { f.feature.setPartition(partition); if (f.feature instanceof ResidentFeature && !((ResidentFeature)f.feature).isReadResident()) sizes.add(f.feature.getLength(), f.feature); } else { memoryleft -= f.feature.getLength(); } } int featuresleft = features.size(); for (Map.Entry entry : sizes.descending()) { if (entry.getValue() instanceof ResidentFeature) if (entry.getKey() < memoryleft) { ((ResidentFeature)entry.getValue()).readResident(); memoryleft -= entry.getKey(); featuresleft--; } else { break; } } for (ReportedFeature f : features) { if (!(f.feature instanceof ResidentFeature) || 
!((ResidentFeature)f.feature).isReadResident()) { f.feature.getFile().setBufferSize((int)Math.min(50000000, f.feature.getLength())); f.feature.openRead(); } else { f.feature.reuse(); } for (Document d : docids) { f.feature.read(d); f.feature.report(d, f.reportID); } if (!(f.feature instanceof ResidentFeature) || !((ResidentFeature)f.feature).isReadResident()) { f.feature.closeRead(); } } } /** * a wrapper for {@link #retrieveQuery(Retriever.Query) } *

* @param query the string that represents the user's need * @return a Query object that contains the original query and the retrieved * documents */ public Query retrieveQuery(String query) { Query q = constructQueryRequest(0, query); return retrieveQuery(q); } /** * @param query a string that describes the documents to retrieveQueries. * @return a Query object containing the query string and the current * settings as the search parameters. */ public Query constructQueryRequest(String query) { return constructQueryRequest(0, query); } /** * @param query a string that describes the documents to retrieveQueries. * @return a Query object containing the query string and the current * settings as the search parameters. */ public Query constructQueryRequest(int id, String query) { Query q = new Query(repository, id, query); q.query = this.tokenizeString(query); return q; } public Strategy constructStrategy(Query q) { return Strategy.create(this, q); } /** * retrieves a single query described in the Query object. The Retrieval * Model decides what is retrieved (i.e. only document id's or if title, url, * etc. are included). If necessary use readDocuments to readValue the * documents meta data. *

* This function uses an iterative retrieval strategy that uses multi-pass * retrieval whenever the retrieval model for a query decides that a results * retrieval pass is necessary. By default, optional stemming and lowercasing * is only done on the first pass. *

* @param q the string that represents the user's need * @return the passed Query object is expanded with the retrieved documents */ public Query retrieveQuery(Query q) { // implements local retrieval strategy while (!q.done()) { Strategy retrievalmodel = constructStrategy(q); q = retrieveSinglePass(retrievalmodel); } return q; } public Query retrieveSinglePass(Strategy strategy) { // implements local retrieval strategy //log.info("retrieveSinglePass %d %s", repository.getPartitions(), strategy.query.query); strategy.prepareAggregation(); HashSet collected = new HashSet(); if (strategy instanceof RetrievalModel) { for (int i = 0; i < repository.getPartitions(); i++) { Strategy results = retrieveSegment(strategy.query, i); for (int collector = 0; collector < results.collectors.size(); collector++) { Collector aggregator = strategy.collectors.get(collector); aggregator.aggregate(results.collectors.get(collector)); collected.add(collector); } } } else { Strategy results = retrieveSegment(strategy.query, -1); strategy.collectors = results.collectors; for (Collector c : results.collectors) { collected.add(results.collectors.indexOf(c)); } } strategy.collectors.finishReduce(); for (Collector sdf : strategy.collectors) { if (sdf instanceof CollectorCachable) repository.unloadStoredDynamicFeature(((CollectorCachable)sdf).getStoredDynamicFeature()); } return strategy.finishReduceTask(); } /** * low level method that retrieves data from a single partition. Typically, * this method is used by Map-Reduce processes to retrieveQueries results on * one node. *

* @param query the query object that contains the request * @param partition the id of the repository partition * @return a set of collectors containing the retrieved results. */ public Strategy retrieveSegment(Query query, int segment) { //log.info("retrieveSegment( %d )", segment); Strategy strategy = constructStrategy(query); strategy.partition = segment; retrieveSegment(strategy); return strategy; } public void retrieveSegment(Strategy strategy) { strategy.prepareAggregation(); strategy.prepareRetrieval(); strategy.doMapTask(); strategy.collectors.postLoadFeatures(strategy.partition); strategy.collectors.finishSegmentRetrieval(); } public PostingIterator getPostingIterator(RetrievalModel strategy, int partition) { strategy.root.setTDFDependencies(); return new PostingIterator(strategy, partition); } /** * retrieves all queued queries. *

* @param queue A list of Queries to retrieveQueries * @return ArrayList with results for all queued queries. */ public ArrayList retrieveQueries(ArrayList queue) { for (Query q : queue) { Query retrieveQuery = this.retrieveQuery(q); } return queue; } public ArrayList retrieveQueue() { return retrieveQueries(queue); } public ExtractorConf getExtractor() { if (extractor == null) { extractor = new ExtractorQuery(repository.getConf()); } return extractor; } /** * tokenize the string in the query request. The configuration file is used * to configure the extractor similar to the one used for indexing, except * the removal of special query characters and long numbers. For stemming and * lowercasing the setting in the query object are used rather than the * repository configuration. FunctionNames: are not lowercased. *

* @param q the query request that contains the query string. * @return a String that contains the tokenized version. */ public String tokenizeString(String q) { getExtractor(); Content entity = new Content(); entity.content = q.getBytes(); extractor.process(entity); StringBuilder sb = new StringBuilder(); ArrayList finalterms = new ArrayList(); ExtractChannel channel = entity.get("rrquery"); for (String chunk : channel) { char last = sb.length() == 0 ? 0 : sb.charAt(sb.length() - 1); char first = chunk.length() == 0 ? 0 : chunk.charAt(0); if ("#=:|\0-".indexOf(first) < 0 && "#=:|\0-".indexOf(last) < 0) { sb.append(" "); } sb.append(chunk); } String result = sb.toString().trim().replaceAll("\\s+", " "); return result; } public void mapperProgress() { } public void reducerProgress() { } /** * adds a query to the queue, for batch-wise retrieval. *

* @param q Query object that contains the query request. */ public void addQueue(Query q) { queue.add(q); } /** * adds all Queries in the ArrayList to the queue for batch-wise retrieval. *

* @param list List of Queries that contain the query requests. */ public void addQueue(ArrayList list) { queue.addAll(list); } public void setQueue(ArrayList list) { queue = (ArrayList)list.clone(); } /** * returns the queue for debugging purposes, but does not retrieveQueries * anything. *

* @return the queued queries */ public ArrayList getQueue() { return queue; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy