querqy.lucene.contrib.rewrite.wordbreak.WordBreakCompoundRewriter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of querqy-lucene Show documentation
Show all versions of querqy-lucene Show documentation
Querqy library for query rewriting for Lucene
The newest version!
package querqy.lucene.contrib.rewrite.wordbreak;
import org.apache.lucene.index.IndexReader;
import querqy.LowerCaseCharSequence;
import querqy.model.AbstractNodeVisitor;
import querqy.model.BooleanClause;
import querqy.model.BooleanQuery;
import querqy.model.Clause;
import querqy.model.DisjunctionMaxClause;
import querqy.model.DisjunctionMaxQuery;
import querqy.model.ExpandedQuery;
import querqy.model.Node;
import querqy.model.QuerqyQuery;
import querqy.model.Query;
import querqy.rewrite.RewriterOutput;
import querqy.model.Term;
import querqy.rewrite.QueryRewriter;
import querqy.rewrite.SearchEngineRequestAdapter;
import querqy.trie.TrieMap;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Objects;
public class WordBreakCompoundRewriter extends AbstractNodeVisitor implements QueryRewriter {
/** Produces candidate splits of a single term (see {@code decompound}). */
private final LuceneWordBreaker wordBreaker;
/** Produces candidate combinations of adjacent terms (see {@code addCompounds}). */
private final LuceneCompounder compounder;
/** Index reader handed to both the word breaker and the compounder. */
private final IndexReader indexReader;
/** If true, trigger-word and protected-word lookups use a lower-cased view of the input. */
private final boolean lowerCaseInput;
// We are not using this as a map but as a kind of a set to look up CharSequences quickly
// NOTE(review): declared without a type argument here; confirm the value type against TrieMap's declaration.
private final TrieMap reverseCompoundTriggerWords;
/** Non-generated terms visited so far within the current BooleanQuery; reset per rewrite and per BooleanQuery. */
private ArrayDeque<Term> previousTerms;
/** Reverse-compound trigger terms collected while visiting; they are removed after the traversal. */
private ArrayDeque<Term> termsToDelete;
/** Generated nodes collected while visiting; they are attached to their parents after the traversal. */
private List<Node> nodesToAdd;
/** If true, reverse compounds are always added, not only after a trigger word. */
private final boolean alwaysAddReverseCompounds;
/** Upper bound passed to the word breaker for the number of decompounds per term. */
private final int maxDecompoundExpansions;
/** Passed to the word breaker; see the constructor documentation for its meaning. */
private final boolean verifyDecompoundCollation;
/** Terms that must neither be split nor be the result of a combination. */
private final TrieMap protectedWords;
/**
 * Creates a rewriter that adds compounds and decompounds of the user query's terms.
 *
 * @param wordBreaker The word breaker used to split terms
 * @param compounder The compounder used to join adjacent terms
 * @param indexReader The index reader passed to the word breaker and the compounder
 * @param lowerCaseInput Iff true, lowercase input before matching it against the dictionary field.
 * @param alwaysAddReverseCompounds Iff true, reverse shingles will be added to the query
 * @param reverseCompoundTriggerWords Query tokens found as keys in this map will trigger the creation of a
 *                                    reverse compound of the surrounding tokens. Must not be {@code null}.
 * @param maxDecompoundExpansions The maximum number of decompounds to add to the query
 * @param verifyDecompoundCollation Iff true, verify that all parts of the compound cooccur in dictionaryField
 *                                  after decompounding
 * @param protectedWords The "false-positive" set of terms that should never be split or be result of a
 *                       combination
 * @throws IllegalArgumentException if {@code reverseCompoundTriggerWords} is {@code null}
 */
public WordBreakCompoundRewriter(final LuceneWordBreaker wordBreaker, final LuceneCompounder compounder,
                                 final IndexReader indexReader,
                                 final boolean lowerCaseInput, final boolean alwaysAddReverseCompounds,
                                 final TrieMap reverseCompoundTriggerWords,
                                 final int maxDecompoundExpansions, final boolean verifyDecompoundCollation,
                                 final TrieMap protectedWords) {

    // Validate before touching any field.
    if (null == reverseCompoundTriggerWords) {
        throw new IllegalArgumentException("reverseCompoundTriggerWords must not be null");
    }

    // collaborators
    this.wordBreaker = wordBreaker;
    this.compounder = compounder;
    this.indexReader = indexReader;

    // lookup structures
    this.reverseCompoundTriggerWords = reverseCompoundTriggerWords;
    this.protectedWords = protectedWords;

    // behaviour switches
    this.lowerCaseInput = lowerCaseInput;
    this.alwaysAddReverseCompounds = alwaysAddReverseCompounds;
    this.maxDecompoundExpansions = maxDecompoundExpansions;
    this.verifyDecompoundCollation = verifyDecompoundCollation;
}
@Override
public RewriterOutput rewrite(final ExpandedQuery query, final SearchEngineRequestAdapter requestAdapter) {
final QuerqyQuery> userQuery = query.getUserQuery();
if (userQuery instanceof Query){
previousTerms = new ArrayDeque<>();
termsToDelete = new ArrayDeque<>();
nodesToAdd = new LinkedList<>();
visit((Query) userQuery);
// append nodesToAdd to parent query
nodesToAdd.forEach(node -> {
final Node parent = node.getParent();
// TODO: extend BooleanParent? interface so that we don't need this cast?
if (parent instanceof DisjunctionMaxQuery) {
((DisjunctionMaxQuery) parent).addClause((DisjunctionMaxClause) node);
} else if (parent instanceof BooleanQuery) {
((BooleanQuery) parent).addClause((BooleanClause) node);
} else {
throw new IllegalStateException("Unknown parent type " + parent.getClass().getName());
}
});
termsToDelete.forEach(this::removeIfNotOnlyChild);
}
return RewriterOutput.builder().expandedQuery(query).build();
}
/**
 * Removes {@code term} from the query tree, but never leaves an empty parent behind.
 *
 * <ul>
 *   <li>If the term's parent has further clauses, only the term is removed.</li>
 *   <li>Otherwise, if the grand-parent exists and has further clauses, the whole parent is removed from
 *       the grand-parent.</li>
 *   <li>Otherwise nothing is removed.</li>
 * </ul>
 *
 * @param term The term to remove
 */
public void removeIfNotOnlyChild(final Term term) {
    // TODO: go until top level?
    final DisjunctionMaxQuery dmq = term.getParent();

    if (dmq.getClauses().size() > 1) {
        // the term has siblings: dropping just the term is enough
        dmq.removeClause(term);
        return;
    }

    // the term is the only clause of its parent: try to drop the parent as a whole
    final BooleanQuery bq = dmq.getParent();
    final boolean parentHasSiblings = bq != null && bq.getClauses().size() > 1;
    if (parentHasSiblings) {
        bq.removeClause(dmq);
    }
}
/**
 * Visits the single non-generated clause of {@code dmq}, if there is one. Generated clauses are skipped.
 *
 * @param dmq The disjunction max query to inspect
 * @return always {@code null}
 * @throws IllegalArgumentException if {@code dmq} has more than one non-generated clause; this is checked
 *                                  for all clauses before any clause is visited
 */
@Override
public Node visit(final DisjunctionMaxQuery dmq) {
    final List<DisjunctionMaxClause> clauses = dmq.getClauses();
    if (clauses == null || clauses.isEmpty()) {
        return null;
    }

    DisjunctionMaxClause nonGeneratedClause = null;
    for (final DisjunctionMaxClause clause : clauses) {
        if (clause.isGenerated()) {
            continue;
        }
        // second non-generated clause - cannot handle this
        if (nonGeneratedClause != null) {
            throw new IllegalArgumentException("cannot handle more then one non-generated DMQ clause");
        }
        nonGeneratedClause = clause;
    }

    if (nonGeneratedClause != null) {
        nonGeneratedClause.accept(this);
    }
    return null;
}
/**
 * Handles a single term of the user query. Generated terms are ignored.
 *
 * <p>A reverse-compound trigger word is only scheduled for deletion. Any other term is decompounded
 * (unless it is a protected word) and then compounded with the preceding terms. In both cases the term is
 * finally remembered as a predecessor for the terms that follow.
 *
 * @param term The term to handle
 * @return the given {@code term}
 */
@Override
public Node visit(final Term term) {
    // don't handle generated terms
    if (term.isGenerated()) {
        return term;
    }

    final CharSequence value = term.getValue();
    if (isReverseCompoundTriggerWord(value)) {
        termsToDelete.add(term);
    } else {
        final boolean mayBeSplit = !isProtectedWord(value);
        if (mayBeSplit) {
            decompound(term);
        }
        compound(term);
    }

    // must happen after compound(term) so that the term is not combined with itself
    previousTerms.add(term);
    return term;
}
/**
 * Asks the word breaker for splits of {@code term} and schedules one boolean query per split for addition
 * to the term's parent.
 *
 * <p>Each scheduled query is a {@code SHOULD} boolean query holding one {@code MUST} disjunction max query
 * per word of the split; each of those holds a single term in the field of the original term.
 * Null or empty splits are skipped.
 *
 * @param term The term to split
 * @throws RuntimeException wrapping the {@link IOException} if the word breaker fails
 */
protected void decompound(final Term term) {
    try {
        for (final CharSequence[] parts : wordBreaker.breakWord(term, indexReader, maxDecompoundExpansions,
                verifyDecompoundCollation)) {

            if (parts == null || parts.length == 0) {
                continue;
            }

            // NOTE(review): the trailing 'true' arguments presumably mark the new nodes as generated - confirm
            final BooleanQuery allParts = new BooleanQuery(term.getParent(), Clause.Occur.SHOULD, true);
            for (int i = 0; i < parts.length; i++) {
                final DisjunctionMaxQuery partQuery = new DisjunctionMaxQuery(allParts, Clause.Occur.MUST, true);
                allParts.addClause(partQuery);
                partQuery.addClause(new Term(partQuery, term.getField(), parts[i], true));
            }
            nodesToAdd.add(allParts);
        }
    } catch (final IOException e) {
        // IO is broken, this looks serious -> throw as RTE
        throw new RuntimeException("Error decompounding " + term, e);
    }
}
/**
 * Schedules compounds of {@code term} and its closest preceding non-trigger term for addition to the query.
 *
 * <p>Predecessors are scanned from the most recent one backwards, restricted to terms from the same field
 * as {@code term}. Reverse-compound trigger words are skipped; having skipped at least one of them (or
 * {@code alwaysAddReverseCompounds} being set) additionally requests the reverse compound.
 * Nothing happens if no suitable predecessor exists.
 *
 * @param term The term to combine with its predecessor
 * @throws RuntimeException wrapping the {@link IOException} if the compounder fails
 */
protected void compound(final Term term) {
    if (previousTerms.isEmpty()) {
        return;
    }

    final Iterator<Term> previousTermsIterator =
            new TermsFromFieldIterator(previousTerms.descendingIterator(), term.getField());

    boolean reverseCompound = false;
    Term previousTerm = null;
    while (previousTermsIterator.hasNext() && previousTerm == null) {
        final Term maybePreviousTerm = previousTermsIterator.next();
        if (isReverseCompoundTriggerWord(maybePreviousTerm.getValue())) {
            // keep looking further back, but remember that a trigger word was seen
            reverseCompound = true;
        } else {
            previousTerm = maybePreviousTerm;
        }
    }

    if (previousTerm == null) {
        return;
    }

    final Term[] compoundTerms = new Term[] {previousTerm, term};
    try {
        addCompounds(compoundTerms, false);
        if (reverseCompound || alwaysAddReverseCompounds) {
            addCompounds(compoundTerms, true);
        }
    } catch (final IOException e) {
        throw new RuntimeException("Error while compounding " + term, e);
    }
}
/**
 * Asks the compounder to combine {@code terms} and schedules the results for addition to the query.
 *
 * <p>For every combination that is not a protected word, one new term carrying the combined value is
 * scheduled per original term, using that original term's parent and field.
 *
 * @param terms The terms to combine
 * @param reverse Passed through to the compounder
 * @throws IOException if the compounder fails
 */
private void addCompounds(final Term[] terms, final boolean reverse) throws IOException {
    for (final LuceneCompounder.CompoundTerm candidate : compounder.combine(terms, indexReader, reverse)) {
        if (isProtectedWord(candidate.value)) {
            continue;
        }
        for (final Term original : candidate.originalTerms) {
            final Term compound = new Term(original.getParent(), original.getField(), candidate.value, true);
            nodesToAdd.add(compound);
        }
    }
}
/**
 * Tells whether {@code chars} is contained in {@code reverseCompoundTriggerWords}. The lookup uses a
 * lower-cased view of the input iff {@code lowerCaseInput} is set.
 *
 * @param chars The token to look up
 * @return true iff the complete sequence ends in a final state of the trie
 */
private boolean isReverseCompoundTriggerWord(final CharSequence chars) {
    final CharSequence lookupKey;
    if (lowerCaseInput) {
        lookupKey = new LowerCaseCharSequence(chars);
    } else {
        lookupKey = chars;
    }
    return reverseCompoundTriggerWords.get(lookupKey).getStateForCompleteSequence().isFinal();
}
/**
 * Tells whether {@code chars} is contained in {@code protectedWords}. The lookup uses a lower-cased view
 * of the input iff {@code lowerCaseInput} is set.
 *
 * @param chars The token to look up
 * @return true iff the complete sequence ends in a final state of the trie; false if no protected words
 *         were configured
 */
private boolean isProtectedWord(final CharSequence chars) {
    // The constructor does not reject a null protectedWords, so treat it as an empty set
    // instead of failing with a NullPointerException on the first lookup.
    if (protectedWords == null) {
        return false;
    }
    return protectedWords.get(lowerCaseInput ? new LowerCaseCharSequence(chars) : chars)
            .getStateForCompleteSequence().isFinal();
}
/**
 * Forgets all previously seen terms before descending into {@code bq}, so that terms are only compounded
 * with predecessors collected after entering this boolean query.
 *
 * @param bq The boolean query to visit
 * @return whatever the superclass traversal returns
 */
@Override
public Node visit(final BooleanQuery bq) {
    // reset the predecessor window for this boolean query
    previousTerms.clear();
    final Node visited = super.visit(bq);
    return visited;
}
public static class MaxSortable implements Comparable> {
public final T obj;
public final int count;
public MaxSortable(final T obj, final int count) {
this.obj = obj;
this.count = count;
}
@Override
public int compareTo(final MaxSortable o) {
// reverse order
return Integer.compare(o.count, this.count);
}
}
/**
 * Iterator wrapper that only iterates as long as it can emit terms from a given field.
 *
 * <p>As soon as the delegate yields a term from a different field, this iterator is exhausted for good:
 * the offending term stays in the look-ahead slot and no further elements are pulled from the delegate.
 */
private static class TermsFromFieldIterator implements Iterator<Term> {

    private final Iterator<Term> delegate;
    private final String field;

    /** One-element look-ahead buffer; {@code null} means that nothing has been fetched yet. */
    private Term slot = null;

    /**
     * @param delegate The iterator supplying the terms
     * @param field The field that emitted terms must belong to (compared with {@link Objects#equals})
     */
    public TermsFromFieldIterator(final Iterator<Term> delegate, final String field) {
        this.delegate = delegate;
        this.field = field;
    }

    /**
     * @return true iff there is a next term and it belongs to the expected field. Calling this method
     *         repeatedly without calling {@link #next()} yields the same answer.
     */
    @Override
    public boolean hasNext() {
        return tryFillSlotIfEmpty() && Objects.equals(slot.getField(), field);
    }

    /**
     * @return the next term from the expected field
     * @throws NoSuchElementException if the delegate is exhausted or its next term is from another field
     */
    @Override
    public Term next() {
        if (!hasNext()) {
            throw new NoSuchElementException("No more terms");
        }
        final Term term = slot;
        slot = null;
        return term;
    }

    /**
     * Fetches the next term from the delegate unless one is already buffered.
     *
     * @return true iff the slot holds a term afterwards. This must also be true when the slot was
     *         already filled by an earlier call, otherwise a second hasNext() would wrongly report false.
     */
    private boolean tryFillSlotIfEmpty() {
        if (slot == null && delegate.hasNext()) {
            slot = delegate.next();
        }
        return slot != null;
    }
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy