org.apache.lucene.search.suggest.document.FuzzyCompletionQuery Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.suggest.document;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.suggest.BitsProducer;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;
/**
* A {@link CompletionQuery} that match documents containing terms within an edit distance of the
* specified prefix.
*
* This query boost documents relative to how similar the indexed terms are to the provided
* prefix.
*
*
Example usage of querying an analyzed prefix within an edit distance of 1 of 'subg' against a
* field 'suggest_field' is as follows:
*
*
* CompletionQuery query = new FuzzyCompletionQuery(analyzer, new Term("suggest_field", "subg"));
*
*
* @lucene.experimental
*/
public class FuzzyCompletionQuery extends PrefixCompletionQuery {
/**
* Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix parameters in Unicode code
* points (actual letters) instead of bytes.
*/
public static final boolean DEFAULT_UNICODE_AWARE = false;
/** The default minimum length of the key before any edits are allowed. */
public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;
/** The default prefix length where edits are not allowed. */
public static final int DEFAULT_NON_FUZZY_PREFIX = 1;
/** The default maximum number of edits for fuzzy suggestions. */
public static final int DEFAULT_MAX_EDITS = 1;
/** The default transposition value passed to {@link LevenshteinAutomata} */
public static final boolean DEFAULT_TRANSPOSITIONS = true;
private final int maxEdits;
private final boolean transpositions;
private final int nonFuzzyPrefix;
private final int minFuzzyLength;
private final boolean unicodeAware;
private final int determinizeWorkLimit;
/**
* Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer)} with no
* filter
*/
public FuzzyCompletionQuery(Analyzer analyzer, Term term) {
this(analyzer, term, null);
}
/**
* Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer, int,
* boolean, int, int, boolean, int)} with defaults for maxEdits
, transpositions
*
, nonFuzzyPrefix
, minFuzzyLength
, unicodeAware
* and determinizeWorkLimit
See {@link #DEFAULT_MAX_EDITS}, {@link
* #DEFAULT_TRANSPOSITIONS}, {@link #DEFAULT_NON_FUZZY_PREFIX}, {@link #DEFAULT_MIN_FUZZY_LENGTH},
* {@link #DEFAULT_UNICODE_AWARE} and {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} for
* defaults
*/
public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter) {
this(
analyzer,
term,
filter,
DEFAULT_MAX_EDITS,
DEFAULT_TRANSPOSITIONS,
DEFAULT_NON_FUZZY_PREFIX,
DEFAULT_MIN_FUZZY_LENGTH,
DEFAULT_UNICODE_AWARE,
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
}
/**
* Constructs an analyzed fuzzy prefix completion query
*
* @param analyzer used to analyze the provided {@link Term#text()}
* @param term query is run against {@link Term#field()} and {@link Term#text()} is analyzed with
* analyzer
* @param filter used to query on a sub set of documents
* @param maxEdits maximum number of acceptable edits
* @param transpositions value passed to {@link LevenshteinAutomata}
* @param nonFuzzyPrefix prefix length where edits are not allowed
* @param minFuzzyLength minimum prefix length before any edits are allowed
* @param unicodeAware treat prefix as unicode rather than bytes
* @param determinizeWorkLimit maximum effort allowed to determinize the {@link
* LevenshteinAutomata}
*/
public FuzzyCompletionQuery(
Analyzer analyzer,
Term term,
BitsProducer filter,
int maxEdits,
boolean transpositions,
int nonFuzzyPrefix,
int minFuzzyLength,
boolean unicodeAware,
int determinizeWorkLimit) {
super(analyzer, term, filter);
this.maxEdits = maxEdits;
this.transpositions = transpositions;
this.nonFuzzyPrefix = nonFuzzyPrefix;
this.minFuzzyLength = minFuzzyLength;
this.unicodeAware = unicodeAware;
this.determinizeWorkLimit = determinizeWorkLimit;
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
final Automaton originalAutomata;
try (CompletionTokenStream stream =
(CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) {
originalAutomata = stream.toAutomaton(unicodeAware);
}
Set refs = new HashSet<>();
Automaton automaton = toLevenshteinAutomata(originalAutomata, refs);
if (unicodeAware) {
Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
utf8automaton = Operations.determinize(utf8automaton, determinizeWorkLimit);
automaton = utf8automaton;
}
// TODO Accumulating all refs is bad, because the resulting set may be very big.
// TODO Better iterate over automaton again inside FuzzyCompletionWeight?
return new FuzzyCompletionWeight(this, automaton, refs);
}
private Automaton toLevenshteinAutomata(Automaton automaton, Set refs) {
List subs = new ArrayList<>();
FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton);
for (IntsRef string; (string = finiteStrings.next()) != null; ) {
refs.add(IntsRef.deepCopyOf(string));
if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) {
subs.add(Automata.makeString(string.ints, string.offset, string.length));
} else {
int[] ints = new int[string.length - nonFuzzyPrefix];
System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length);
// TODO: maybe add alphaMin to LevenshteinAutomata,
// and pass 1 instead of 0? We probably don't want
// to allow the trailing dedup bytes to be
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
LevenshteinAutomata lev =
new LevenshteinAutomata(
ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
subs.add(
lev.toAutomaton(
maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix)));
}
}
if (subs.isEmpty()) {
// automaton is empty, there is no accepted paths through it
return Automata.makeEmpty(); // matches nothing
} else if (subs.size() == 1) {
// no synonyms or anything: just a single path through the tokenstream
return subs.get(0);
} else {
// multiple paths: this is really scary! is it slow?
// maybe we should not do this and throw UOE?
Automaton a = Operations.union(subs);
// TODO: we could call toLevenshteinAutomata() before det?
// this only happens if you have multiple paths anyway (e.g. synonyms)
return Operations.determinize(a, determinizeWorkLimit);
}
}
/** Get the maximum edit distance for fuzzy matches */
public int getMaxEdits() {
return maxEdits;
}
/** Return whether transpositions count as a single edit */
public boolean isTranspositions() {
return transpositions;
}
/** Get the length of a prefix where no edits are permitted */
public int getNonFuzzyPrefix() {
return nonFuzzyPrefix;
}
/** Get the minimum length of a term considered for matching */
public int getMinFuzzyLength() {
return minFuzzyLength;
}
/** Return true if lengths are measured in unicode code-points rather than bytes */
public boolean isUnicodeAware() {
return unicodeAware;
}
/** Get the maximum effort to use determinizing */
public int getDeterminizeWorkLimit() {
return determinizeWorkLimit;
}
@Override
public String toString(String field) {
StringBuilder buffer = new StringBuilder();
if (!getField().equals(field)) {
buffer.append(getField());
buffer.append(":");
}
buffer.append(getTerm().text());
buffer.append('*');
buffer.append('~');
buffer.append(Integer.toString(maxEdits));
if (getFilter() != null) {
buffer.append(",");
buffer.append("filter");
buffer.append(getFilter().toString());
}
return buffer.toString();
}
private static class FuzzyCompletionWeight extends CompletionWeight {
private final Set refs;
int currentBoost = 0;
public FuzzyCompletionWeight(CompletionQuery query, Automaton automaton, Set refs)
throws IOException {
super(query, automaton);
this.refs = refs;
}
@Override
protected void setNextMatch(IntsRef pathPrefix) {
// NOTE: the last letter of the matched prefix for the exact
// match never makes it through here
// so an exact match and a match with only a edit at the
// end is boosted the same
int maxCount = 0;
for (IntsRef ref : refs) {
int minLength = Math.min(ref.length, pathPrefix.length);
int count = 0;
for (int i = 0; i < minLength; i++) {
if (ref.ints[i + ref.offset] == pathPrefix.ints[i + pathPrefix.offset]) {
count++;
} else {
break;
}
}
maxCount = Math.max(maxCount, count);
}
currentBoost = maxCount;
}
@Override
protected float boost() {
return currentBoost;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy