All downloads are free. Search and download functionality uses the official Maven repository.

io.github.repir.Strategy.Operator.SynonymOperator Maven / Gradle / Ivy

The newest version!
package io.github.repir.Strategy.Operator;

import java.util.ArrayList;
import java.util.TreeSet;
import io.github.repir.Strategy.Collector.CollectorSynonym;
import io.github.repir.Repository.SynStats;
import io.github.repir.Repository.SynStats.Record;
import io.github.repir.Repository.TermDocumentFeature;
import io.github.repir.Retriever.Document;
import io.github.repir.Strategy.GraphRoot;
import io.github.repir.tools.lib.ArrayTools;
import io.github.repir.tools.lib.Log;
import io.github.repir.tools.lib.PrintTools;

/**
 * Resolves the contained operators as if they are all occurrences of the same imaginary
 * symbol. Scoring of a synonym is done using the collection and document frequencies 
 * of this imaginary symbol, e.g. "[animal beast]" will have a collection frequency that is
 * the sum of the collection frequencies of "animal" and "beast", and the document frequency
 * is an exact count of the document in which at least one of these words appear. 
 * If the required statistics are not present in {@link SynStats}, this Operator will 
 * request a pre-pass to obtain its statistics and storing them in {@link SynStats}
 * for future reuse.
 * 

* @author jeroen */ public class SynonymOperator extends CachableOperator { public static Log log = new Log(SynonymOperator.class); public CollectorSynonym collector; public SynStats synstats; public SynonymOperator(GraphRoot im, ArrayList list) { super(im, list); this.sortContainedFeatures(); } @Override public boolean expand() { return containednodes.size() < 2; } public void doAnnounce() { super.announce( ANNOUNCEKEY.SCORABLE, this ); } @Override public void configureFeatures() { if (containednodes.size() == 1) { root.replace(this, containednodes); } else { ArrayList sorted = new ArrayList(); for (Operator g : containednodes) { int pos = 0; while (pos < sorted.size()) { Operator h = sorted.get(pos); if (g instanceof QTerm && !(h instanceof QTerm)) break; if (g instanceof QTerm && ((QTerm)g).getTermID() < ((QTerm)h).getTermID()) break; if (g instanceof ProximityOperator && h instanceof ProximityOperator) { if (g.postReform().compareTo(h.postReform()) >= 0) break; } pos++; } sorted.add(pos, g); } containednodes = sorted; } } @Override public void setTDFDependencies() { for (Operator g : containednodes) g.setTDFDependencies(); } @Override public ArrayList getRequiredTDF() { return new ArrayList(); } /** * Prohibit the announcement of contained query as SCORABLE, because the ProximityOperator will be * scored only at the phrase level. *

* @param key * @param node */ public void announce(ANNOUNCEKEY key, Operator node) { if (key != ANNOUNCEKEY.SCORABLE) { super.announce(key, node); } } @Override public void setupCollector() { collector = new CollectorSynonym(this); } @Override public void process(Document doc) { frequency = 0; TreeSet list = new TreeSet(); int size = 0; for (Operator f : containednodes) { f.process(doc); frequency += f.getFrequency(); int fpos[] = f.getPos(); if (fpos != null && fpos.length > 0) { for (int pos : fpos) list.add(pos); } } int p = 0; pos = ArrayTools.toIntArray(list); } @Override public Operator clone(GraphRoot newmodel) { SynonymOperator f = new SynonymOperator(newmodel, containednodes); f.parent = parent; return f; } @Override public String postReform() { String rf = postReformUnweighted(); if (getQueryWeight() != 1 && getQueryWeight() != 0) { return PrintTools.sprintf("%s#%e", rf, getQueryWeight()); } else return rf; } @Override public String postReformUnweighted() { StringBuilder sb = new StringBuilder(); sb.append("["); if (getCF() > -1) { sb.append(io.github.repir.tools.lib.PrintTools.sprintf("cf=%d ", getCF())); } if (getDF() > -1) { sb.append(io.github.repir.tools.lib.PrintTools.sprintf("df=%d ", getDF())); } for (Operator f : containednodes) { sb.append(f.postReformUnweighted()).append(" "); } return sb.append("]").toString(); } /** * At the end of the pre pass this method is called by the collector to * process the obtained statistics in such a way that * by reformulating the query, the required statistics are written in the query. 
*/ public void processCollected() { cf = collector.cf; df = collector.df; if (cf == 0 && root.removenonexist) root.remove(this); } @Override public String toTermString() { return "[" + toTermString(containednodes) + ']'; } public SynStats getCache() { if (synstats == null) { synstats = SynStats.get(repository); synstats.openRead(); } return synstats; } public Record createRecord() { SynStats cache = getCache(); Record r = (Record) cache.newRecord(); r.query = postReformUnweighted(); return r; } @Override public void readCachedData() { SynStats cache = getCache(); Record s = createRecord(); Record r = cache.find(s); if (r != null) { cf = r.cf; df = r.df; } } @Override public void prepareRetrieval() { } @Override public String toString() { return io.github.repir.tools.lib.PrintTools.sprintf("FeatureSynonym[%d] weight %f\n", this.containednodes.size(), this.getQueryWeight()); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy