
package io.github.repir.Strategy;
import io.github.repir.QueryParser.QueryLexer;
import io.github.repir.QueryParser.QueryParser;
import io.github.repir.Repository.Stopwords.StopWords;
import io.github.repir.Repository.StoredFeature;
import io.github.repir.Repository.Term;
import io.github.repir.Repository.TermDocumentFeature;
import io.github.repir.Retriever.Query;
import io.github.repir.Retriever.Retriever;
import io.github.repir.Strategy.Collector.MasterCollector;
import io.github.repir.Strategy.Operator.Operator;
import io.github.repir.Strategy.Operator.ProximityOperator;
import io.github.repir.Strategy.Operator.QTerm;
import io.github.repir.tools.lib.ClassTools;
import io.github.repir.tools.lib.Log;
import io.github.repir.tools.lib.PrintTools;
import io.github.repir.tools.lib.StrTools;
import io.github.repir.tools.Words.englishStemmer;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.antlr.v4.runtime.ANTLRInputStream;
import org.antlr.v4.runtime.CommonTokenStream;
import org.antlr.v4.runtime.RecognitionException;
/**
* A GraphRoot is responsible for constructing a graph of {@link Operator}s that
* will process retrieval according to the {@link Query}.
* <p>
* A graph is used by default for {@link RetrievalModel}s, i.e. {@link Strategy}s that
* return a ranked list of {@link Document}s. The graph is constructed by parsing
* the specified {@link Query} in {@link #buildGraph()}, which uses the
* {@link QueryParser} to construct a bi-directional graph. The leaf nodes are
* usually {@link QTerm} {@link Operator}s, which are seeded with postings lists of
* term features ({@link TermInverted}), and the operator nodes nearest the root are
* the nodes that contribute directly to the score assigned to the Document being
* processed.
* <p>
* After the graph has been built, the GraphRoot prepares retrieval in a number of
* phases:
* <ol>
* <li>Expand: nodes are asked to modify the graph if they need to expand or be
* replaced;</li>
* <li>Announce: all nodes announce their potential use to their parent node, until
* the announcements reach the root;</li>
* <li>Cleanse: stop words and operators for non-existing stored features are
* removed;</li>
* <li>ConfigureContainedFeatures: complex features configure their settings, e.g.
* weight or span;</li>
* <li>setTermPositionsNeeded is propagated across the graph, and set to true if any
* feature needs term positions, to ensure these are passed during processing;</li>
* <li>ReadStatistics: features read their required statistics, e.g.
* collection/document frequency;</li>
* <li>needsPrePass(): any operator can signal that it needs a pre-pass to collect
* additional information before being able to operate during the final pass;</li>
* <li>SetupCollector: if a pre-pass is needed, operators install a {@link Collector}
* to obtain these results.</li>
* </ol>
*
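* <p>
* A minimal usage sketch (hypothetical; assumes an initialized {@link RetrievalModel}
* {@code rm} whose query has been set):
* <pre>{@code
* GraphRoot root = new GraphRoot(rm);
* root.buildGraph();        // parse the query into an operator graph
* root.prepareRetrieval();  // run the preparation phases listed above
* }</pre>
*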
* @author jeroen
*/
public class GraphRoot extends GraphComponent {
public static Log log = new Log(GraphRoot.class);
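/** Operators registered per {@link ANNOUNCEKEY}, consulted while cleansing and preparing the graph */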
private HashMap<ANNOUNCEKEY, ArrayList<Operator>> nodelists = new HashMap<ANNOUNCEKEY, ArrayList<Operator>>();
//public static englishStemmer stemmer = englishStemmer.get();
public Retriever retriever;
public Query query;
//public Query queryrequest;
public double documentpriorfrequency;
public boolean removenonexist;
public Class phraseclass = ProximityOperator.class;
public GraphRoot(RetrievalModel rm) {
super(rm);
this.query = rm.query;
this.retriever = rm.retriever;
this.removenonexist = repository.configuredBoolean("retriever.removenonexist", true);
}
/**
* Builds the GraphRoot from the {@link Retriever.Query} request. The query string is
* tokenized using {@link Retriever.Retriever#tokenizeString(Retriever.Query)} and then
* parsed using {@link QueryParser}, which converts the query into a graph of contained
* nodes that extract and process document data into the results collected.
*/
public final void buildGraph() {
try {
//log.info("query %s", query.query);
ANTLRInputStream input = new ANTLRInputStream(retrievalmodel.getQueryToRetrieve());
QueryLexer lexer = new QueryLexer(input);
CommonTokenStream tokens = new CommonTokenStream(lexer);
QueryParser parser = new QueryParser(tokens);
parser.root = this;
parser.prog();
} catch (RecognitionException ex) {
log.exception(ex, "build() queryrequest %s", retrievalmodel.getQueryToRetrieve());
}
}
/**
* This method should be called prior to retrieval. It assembles the list of Features
* that are to be extracted from the retrieved documents and that are necessary for the
* retrieval process. All contained nodes in the model are then initialized using the
* statistics for the extracted features.
*/
public void prepareRetrieval() {
this.doExpand();
this.doAnnounceContainedFeatures();
this.cleanseModel();
this.doConfigureContainedFeatures();
this.setWillBeScored(true);
this.setTermPositionsNeeded(false);
this.doReadStatistics();
if (this.needsPrePass()) {
doSetupCollector();
}
}
/**
* Called before creating a PostingsIterator, to set dependencies between
* terms used to skip documents that are not scorable.
*/
public void setTDFDependencies() {
for ( StoredFeature f : retrievalmodel.getUsedFeatures() )
if (f instanceof TermDocumentFeature)
((TermDocumentFeature)f).resetDependencies();
for (Operator g : containednodes)
if (!this.needsPrePass() || g.needsCollect())
g.setTDFDependencies();
}
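/**
* @param termstring the term to look up in the repository
* @return a new {@link QTerm} leaf operator for that term
*/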
public QTerm getTerm(String termstring) {
return new QTerm(this, repository.getTerm(termstring));
}
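/**
* @param termid the id of the term to look up in the repository
* @return a new {@link QTerm} leaf operator for that term
*/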
public QTerm getTerm(int termid) {
return new QTerm(this, repository.getTerm(termid));
}
/**
* Removes all Operators announced as stop words (if the query requests stop word
* removal), all Operators for terms that do not exist in the vocabulary, and empty
* Features.
*/
public void cleanseModel() {
ArrayList<Operator> list;
if (query.removeStopwords) {
// remove stopwords
list = this.getAnnounce(ANNOUNCEKEY.STOPWORD);
if (list != null) {
for (int f = list.size() - 1; f >= 0; f--) {
this.remove(list.get(f));
}
}
}
// remove words that are not in the vocabulary
list = this.getAnnounce(ANNOUNCEKEY.NONEXIST);
if (list != null) {
for (int f = list.size() - 1; f >= 0; f--) {
this.remove(list.get(f));
}
}
}
/**
* @return true if any of the nodes needs to do a pre-pass to collect feature data
*/
public boolean needsPrePass() {
ArrayList<Operator> list = nodelists.get(ANNOUNCEKEY.NEEDSCOLLECT);
if (list != null && list.size() > 0) {
return true;
}
list = nodelists.get(ANNOUNCEKEY.NEEDSCACHECOLLECT);
if (list != null && list.size() > 0) {
return true;
}
return false;
}
/**
* @return the reformulated query after retrieval using collected feature data
*/
public String postReform() {
StringBuilder sb = new StringBuilder();
for (Operator f : containednodes) {
sb.append(f.postReform()).append(" ");
}
return sb.toString();
}
/**
* @param remove the Operator to be recursively removed from the GraphRoot
*/
@Override
public void remove(Operator remove) {
super.remove(remove);
for (Map.Entry<ANNOUNCEKEY, ArrayList<Operator>> entry : nodelists.entrySet()) {
Iterator<Operator> iter = entry.getValue().iterator();
while (iter.hasNext()) {
if (iter.next() == remove) {
iter.remove();
break;
}
}
}
}
/**
* Constructs a custom Operator class. This enables query formulation using
* "featureclass:( contained nodes )". This will go horribly wrong if the class does
* not exist, or if it cannot be constructed with a GraphRoot and an ArrayList of
* Operators as parameters.
*
* @param featureclassname case-sensitive class name of the Operator
* @param terms a list of contained Operator nodes
* @return the new Operator
*/
public Operator construct(String featureclassname, ArrayList<Operator> terms) {
try {
Class featureclass = ClassTools.toClass(featureclassname, Operator.class.getPackage().getName());
Constructor cons = ClassTools.getAssignableConstructor(featureclass, Operator.class, GraphRoot.class, ArrayList.class);
Operator f = (Operator) ClassTools.construct(cons, this, terms);
return f;
} catch (ClassNotFoundException ex) {
log.fatalexception(ex, "construct() invalid feature class %s", featureclassname);
}
return null;
}
/**
* The inverse of {@link #construct(java.lang.String, java.util.ArrayList)}: formulates
* a String that embeds a query string in a custom Operator parent, so that when parsed,
* the elements of the query string become elements of that Operator parent.
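*
* A short illustration (hypothetical query string):
* <pre>{@code
* // yields "ProximityOperator:(red apple)"
* GraphRoot.reformulate(ProximityOperator.class, "red apple");
* }</pre>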
*
* @param parent the parent Operator class
* @param query query string to pass as elements
* @return new query string
*/
public static String reformulate(Class parent, String query) {
String classname = parent.getCanonicalName();
classname = StrTools.removeOptionalStart(classname, Operator.class.getPackage().getName() + ".");
return classname + ":(" + query + ")";
}
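/**
* @param r the Operator to reformulate
* @return the unweighted reformulation of r, suffixed with "#weight" if r carries a
* query weight other than 0 or 1
*/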
public static String reformulate(Operator r) {
String rf = reformulateUnweighted(r);
if (r.getQueryWeight() != 1 && r.getQueryWeight() != 0) {
rf = PrintTools.sprintf("%s#%g", rf, r.getQueryWeight());
}
return rf;
}
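/**
* @param r the Operator to reformulate
* @return a "name:( children )" reformulation in which the contained nodes keep their
* weights, with r's own weight appended if it is not 0 or 1
*/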
public static String reformulateWeighted(Operator r) {
StringBuilder sb = new StringBuilder();
sb.append(r.getName()).append(":(");
for (Operator n : r.containednodes)
sb.append(n.postReform()).append(" ");
// close the parenthesized node list before appending the weight,
// matching the format produced by reformulateUnweighted()
sb.append(")");
if (r.getQueryWeight() != 1 && r.getQueryWeight() != 0) {
sb.append(PrintTools.sprintf("#%g", r.getQueryWeight()));
}
return sb.toString();
}
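/**
* @param r the Operator to reformulate
* @return a "name:( children )" reformulation with all weights omitted
*/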
public static String reformulateUnweighted(Operator r) {
StringBuilder sb = new StringBuilder();
sb.append(r.getName()).append(":(");
for (Operator n : r.containednodes)
sb.append(n.postReformUnweighted()).append(" ");
sb.append(")").toString();
return sb.toString();
}
/**
* For debug purposes, recursively prints the GraphRoot
*/
public void print() {
for (Operator f : containednodes) {
log.printf("%s", f.printRecursive(2));
}
}
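/**
* @param key the announcement key
* @return the list of Operators announced under the given key, creating an empty list
* if none exists yet
*/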
public ArrayList<Operator> getAnnounce(ANNOUNCEKEY key) {
ArrayList<Operator> list = nodelists.get(key);
if (list == null) {
list = new ArrayList<Operator>();
nodelists.put(key, list);
}
return list;
}
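/**
* Registers a node under the given announcement key, e.g. as a stop word or as a node
* that needs a collection pre-pass.
* @param key the announcement key
* @param node the Operator to announce
*/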
public void announce(ANNOUNCEKEY key, Operator node) {
ArrayList<Operator> list = getAnnounce(key);
list.add(node);
}
}