package io.github.repir.Strategy;

import io.github.repir.QueryParser.QueryLexer;
import io.github.repir.QueryParser.QueryParser;
import io.github.repir.Repository.Stopwords.StopWords;
import io.github.repir.Repository.StoredFeature;
import io.github.repir.Repository.Term;
import io.github.repir.Repository.TermDocumentFeature;
import io.github.repir.Retriever.Query;
import io.github.repir.Retriever.Retriever;
import io.github.repir.Strategy.Collector.MasterCollector;
import io.github.repir.Strategy.Operator.Operator;
import io.github.repir.Strategy.Operator.ProximityOperator;
import io.github.repir.Strategy.Operator.QTerm;
import io.github.repir.tools.lib.ClassTools;
import io.github.repir.tools.lib.Log;
import io.github.repir.tools.lib.PrintTools;
import io.github.repir.tools.lib.StrTools;
import io.github.repir.tools.Words.englishStemmer;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.antlr.v4.runtime.ANTLRInputStream;
import org.antlr.v4.runtime.CommonTokenStream;
import org.antlr.v4.runtime.RecognitionException;

/**
 * A GraphRoot is responsible for constructing a graph of {@link Operator}s that
 * will process retrieval according to the {@link Query}.
 *
 * A graph is used by default for {@link RetrievalModel}s, i.e. {@link Strategy}s
 * that return a ranked list of {@link Document}s. The graph is constructed by
 * parsing the specified {@link Query} in {@link #buildGraph()}, using the
 * {@link QueryParser} to construct a bi-directional graph. The leaf nodes are
 * usually {@link QTerm} {@link Operator}s, which are seeded with the postings
 * lists of term features ({@link TermInverted}), and the operator nodes nearest
 * the root are the nodes that contribute directly to the score assigned to the
 * Document being processed.
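 *
 * For illustration (a sketch, not taken from this code): a simple two-term
 * query such as "albert einstein" would be parsed into a root whose leaves are
 * two {@link QTerm} nodes, roughly:
 * <pre>
 *   GraphRoot
 *    +- QTerm "albert"    (seeded with the postings list for "albert")
 *    +- QTerm "einstein"  (seeded with the postings list for "einstein")
 * </pre>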

 * After the graph has been built, the GraphRoot prepares retrieval in a number
 * of phases: (1) the Expansion phase asks nodes to modify the graph if they
 * need to expand or be replaced, (2) the Announce phase requests all nodes to
 * announce their potential use to their parent node until the announcements
 * reach the root, (3) the Cleanse phase removes stop words and operators for
 * non-existing stored features, (4) ConfigureContainedFeatures lets complex
 * features configure their settings, e.g. weight or span, (5)
 * setTermPositionsNeeded is propagated across the graph, and is set to true if
 * any feature needs term positions, to ensure these are passed during
 * processing, (6) in ReadStatistics, features read their required statistics,
 * e.g. collection/document frequency, (7) through needsPrePass() any operator
 * can signal that it needs a prepass to pre-collect additional information
 * before being able to operate during the final pass, in which case (8)
 * SetupCollector installs a {@link Collector} to obtain these results.
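 *
 * A typical lifecycle sketch (assuming a constructed {@link RetrievalModel}
 * named {@code rm}; everything except this class's own members is an
 * assumption):
 * <pre>{@code
 * GraphRoot root = new GraphRoot(rm);
 * root.buildGraph();         // parse rm's query into a graph of Operators
 * root.prepareRetrieval();   // run the preparation phases listed above
 * root.setTDFDependencies(); // set term dependencies before iterating postings
 * }</pre>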

 *
 * @author jeroen
 */
public class GraphRoot extends GraphComponent {

    public static Log log = new Log(GraphRoot.class);
    private HashMap<ANNOUNCEKEY, ArrayList<Operator>> nodelists = new HashMap<ANNOUNCEKEY, ArrayList<Operator>>();
    //public static englishStemmer stemmer = englishStemmer.get();
    public Retriever retriever;
    public Query query;
    //public Query queryrequest;
    public double documentpriorfrequency;
    public boolean removenonexist;
    public Class phraseclass = ProximityOperator.class;

    public GraphRoot(RetrievalModel rm) {
        super(rm);
        this.query = rm.query;
        this.retriever = rm.retriever;
        this.removenonexist = repository.configuredBoolean("retriever.removenonexist", true);
    }

    /**
     * Builds a GraphRoot from a {@link Retriever.Query} request. The query
     * string is tokenized using
     * {@link Retriever.Retriever#tokenizeString(Retriever.Query)} and then
     * parsed using {@link QueryParser}, which converts the query into a graph
     * of contained nodes that extract and process document contents into the
     * data that is collected.
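     *
     * A minimal sketch of the parsing pipeline this method runs (it mirrors
     * the method body; the query string "albert einstein" is an assumed
     * example):
     * <pre>{@code
     * ANTLRInputStream input = new ANTLRInputStream("albert einstein");
     * QueryLexer lexer = new QueryLexer(input);
     * QueryParser parser = new QueryParser(new CommonTokenStream(lexer));
     * parser.root = this;
     * parser.prog(); // side effect: adds the parsed Operator nodes to this root
     * }</pre>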

     */
    public final void buildGraph() {
        try {
            //log.info("query %s", query.query);
            ANTLRInputStream input = new ANTLRInputStream(retrievalmodel.getQueryToRetrieve());
            QueryLexer lexer = new QueryLexer(input);
            CommonTokenStream tokens = new CommonTokenStream(lexer);
            QueryParser parser = new QueryParser(tokens);
            parser.root = this;
            parser.prog();
        } catch (RecognitionException ex) {
            log.exception(ex, "build() queryrequest %s", retrievalmodel.getQueryToRetrieve());
        }
    }

    /**
     * This method should be called prior to retrieval. It assembles a list of
     * Features that are to be extracted from the retrieved documents, which are
     * necessary for the retrieval process. All contained nodes in the model are
     * then initialized using the statistics for the extracted features.
     */
    public void prepareRetrieval() {
        this.doExpand();
        this.doAnnounceContainedFeatures();
        this.cleanseModel();
        this.doConfigureContainedFeatures();
        this.setWillBeScored(true);
        this.setTermPositionsNeeded(false);
        this.doReadStatistics();
        if (this.needsPrePass()) {
            doSetupCollector();
        }
    }

    /**
     * Called before creating a PostingsIterator, to set dependencies between
     * terms, used to skip documents that are not scorable.
     */
    public void setTDFDependencies() {
        for (StoredFeature f : retrievalmodel.getUsedFeatures()) {
            if (f instanceof TermDocumentFeature) {
                ((TermDocumentFeature) f).resetDependencies();
            }
        }
        for (Operator g : containednodes) {
            if (!this.needsPrePass() || g.needsCollect()) {
                g.setTDFDependencies();
            }
        }
    }

    public QTerm getTerm(String termstring) {
        return new QTerm(this, repository.getTerm(termstring));
    }

    public QTerm getTerm(int termid) {
        return new QTerm(this, repository.getTerm(termid));
    }

    /**
     * Removes all FeatureExtractors with non-existing terms, all for which
     * {@link #isStemmedStopWord(java.lang.String)} returns true, and empty
     * Features.
     */
    public void cleanseModel() {
        ArrayList<Operator> list;
        if (query.removeStopwords) {
            // remove stopwords
            list = this.getAnnounce(ANNOUNCEKEY.STOPWORD);
            if (list != null) {
                for (int f = list.size() - 1; f >= 0; f--) {
                    this.remove(list.get(f));
                }
            }
        }
        // remove words that are not in the vocabulary
        list = this.getAnnounce(ANNOUNCEKEY.NONEXIST);
        if (list != null) {
            for (int f = list.size() - 1; f >= 0; f--) {
                this.remove(list.get(f));
            }
        }
    }

    /**
     * @return true if any of the nodes needs to do a pre-pass to collect feature data
     */
    public boolean needsPrePass() {
        ArrayList<Operator> list = nodelists.get(ANNOUNCEKEY.NEEDSCOLLECT);
        if (list != null && list.size() > 0) {
            return true;
        }
        list = nodelists.get(ANNOUNCEKEY.NEEDSCACHECOLLECT);
        if (list != null && list.size() > 0) {
            return true;
        }
        return false;
    }

    /**
     * @return the reformulated query after retrieval using collected feature data
     */
    public String postReform() {
        StringBuilder sb = new StringBuilder();
        for (Operator f : containednodes) {
            sb.append(f.postReform()).append(" ");
        }
        return sb.toString();
    }

    /**
     * @param remove feature to be recursively removed from the GraphRoot
     */
    @Override
    public void remove(Operator remove) {
        super.remove(remove);
        for (Map.Entry<ANNOUNCEKEY, ArrayList<Operator>> entry : nodelists.entrySet()) {
            Iterator<Operator> iter = entry.getValue().iterator();
            while (iter.hasNext()) {
                if (iter.next() == remove) {
                    iter.remove();
                    break;
                }
            }
        }
    }

    /**
     * Constructs a custom Operator class. This enables query formulation using
     * "featureclass:( contained nodes )". This will go horribly wrong if the
     * class does not exist, or if it cannot be constructed with a GraphRoot and
     * an ArrayList as parameters.
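     *
     * For example, a hypothetical query such as
     * <pre>{@code
     * ProximityOperator:(albert einstein)
     * }</pre>
     * would make the parser construct a {@link ProximityOperator} whose
     * contained nodes are the terms between the parentheses (illustrative; the
     * exact surface syntax is defined by the query grammar).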

     * @param featureclassname case-sensitive classname of the Operator
     * @param terms a list of contained nodes
     * @return the new Operator
     */
    public Operator construct(String featureclassname, ArrayList<Operator> terms) {
        try {
            Class featureclass = ClassTools.toClass(featureclassname, Operator.class.getPackage().getName());
            Constructor cons = ClassTools.getAssignableConstructor(featureclass, Operator.class, GraphRoot.class, ArrayList.class);
            Operator f = (Operator) ClassTools.construct(cons, this, terms);
            return f;
        } catch (ClassNotFoundException ex) {
            log.fatalexception(ex, "construct() invalid feature class %s", featureclassname);
        }
        return null;
    }

    /**
     * Inverse of construct, to formulate a String that embeds a querystring in
     * a custom Operator parent, so that when parsed, the elements in the
     * querystring become elements of the Operator parent.
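     *
     * For example (derived from the implementation below),
     * {@code reformulate(ProximityOperator.class, "albert einstein")} returns
     * {@code "ProximityOperator:(albert einstein)"}.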

     * @param parent parent GraphNode class
     * @param query query string to pass as elements
     * @return new query string
     */
    public static String reformulate(Class parent, String query) {
        String classname = parent.getCanonicalName();
        classname = StrTools.removeOptionalStart(classname, Operator.class.getPackage().getName() + ".");
        return classname + ":(" + query + ")";
    }

    public static String reformulate(Operator r) {
        String rf = reformulateUnweighted(r);
        if (r.getQueryWeight() != 1 && r.getQueryWeight() != 0) {
            rf = PrintTools.sprintf("%s#%g", rf, r.getQueryWeight());
        }
        return rf;
    }

    public static String reformulateWeighted(Operator r) {
        StringBuilder sb = new StringBuilder();
        sb.append(r.getName()).append(":(");
        for (Operator n : r.containednodes) {
            sb.append(n.postReform()).append(" ");
        }
        sb.append(")");
        if (r.getQueryWeight() != 1 && r.getQueryWeight() != 0) {
            sb.append(PrintTools.sprintf("#%g", r.getQueryWeight()));
        }
        return sb.toString();
    }

    public static String reformulateUnweighted(Operator r) {
        StringBuilder sb = new StringBuilder();
        sb.append(r.getName()).append(":(");
        for (Operator n : r.containednodes) {
            sb.append(n.postReformUnweighted()).append(" ");
        }
        sb.append(")");
        return sb.toString();
    }

    /**
     * For debug purposes, recursively prints the GraphRoot
     */
    public void print() {
        for (Operator f : containednodes) {
            log.printf("%s", f.printRecursive(2));
        }
    }

    public ArrayList<Operator> getAnnounce(ANNOUNCEKEY key) {
        ArrayList<Operator> list = nodelists.get(key);
        if (list == null) {
            list = new ArrayList<Operator>();
            nodelists.put(key, list);
        }
        return list;
    }

    public void announce(ANNOUNCEKEY key, Operator node) {
        ArrayList<Operator> list = getAnnounce(key);
        list.add(node);
    }
}
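
/*
 * Usage sketch for the announce mechanism (illustrative; "root" is a prepared
 * GraphRoot and "stopwordNode" an assumed Operator instance): during the
 * announce phase an Operator registers itself under an ANNOUNCEKEY, and
 * cleanseModel() later removes everything announced as STOPWORD or NONEXIST:
 *
 *   root.announce(ANNOUNCEKEY.STOPWORD, stopwordNode);
 *   root.cleanseModel(); // removes stopwordNode from the graph and nodelists
 */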




