All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.suggest.document.FuzzyCompletionQuery Maven / Gradle / Ivy

There is a newer version: 10.1.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.document;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.suggest.BitsProducer;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;

/**
 * A {@link CompletionQuery} that match documents containing terms
 * within an edit distance of the specified prefix.
 * 

* This query boost documents relative to how similar the indexed terms are to the * provided prefix. *

* Example usage of querying an analyzed prefix within an edit distance of 1 of 'subg' * against a field 'suggest_field' is as follows: * *

 *  CompletionQuery query = new FuzzyCompletionQuery(analyzer, new Term("suggest_field", "subg"));
 * 
* * @lucene.experimental */ public class FuzzyCompletionQuery extends PrefixCompletionQuery { /** * Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix * parameters in Unicode code points (actual letters) * instead of bytes. * */ public static final boolean DEFAULT_UNICODE_AWARE = false; /** * The default minimum length of the key before any edits are allowed. */ public static final int DEFAULT_MIN_FUZZY_LENGTH = 3; /** * The default prefix length where edits are not allowed. */ public static final int DEFAULT_NON_FUZZY_PREFIX = 1; /** * The default maximum number of edits for fuzzy * suggestions. */ public static final int DEFAULT_MAX_EDITS = 1; /** * The default transposition value passed to {@link LevenshteinAutomata} */ public static final boolean DEFAULT_TRANSPOSITIONS = true; private final int maxEdits; private final boolean transpositions; private final int nonFuzzyPrefix; private final int minFuzzyLength; private final boolean unicodeAware; private final int maxDeterminizedStates; /** * Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer)} * with no filter */ public FuzzyCompletionQuery(Analyzer analyzer, Term term) { this(analyzer, term, null); } /** * Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer, * int, boolean, int, int, boolean, int)} * with defaults for maxEdits, transpositions, * nonFuzzyPrefix, minFuzzyLength, * unicodeAware and maxDeterminizedStates * * See {@link #DEFAULT_MAX_EDITS}, {@link #DEFAULT_TRANSPOSITIONS}, * {@link #DEFAULT_NON_FUZZY_PREFIX}, {@link #DEFAULT_MIN_FUZZY_LENGTH}, * {@link #DEFAULT_UNICODE_AWARE} and {@link Operations#DEFAULT_MAX_DETERMINIZED_STATES} * for defaults */ public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter) { this(analyzer, term, filter, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS, DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE, Operations.DEFAULT_MAX_DETERMINIZED_STATES ); } /** * Constructs an analyzed fuzzy prefix completion query * * @param analyzer used to analyze the provided {@link Term#text()} * @param term query is run against {@link Term#field()} and {@link Term#text()} * is analyzed with analyzer * @param filter used to query on a sub set of documents * @param maxEdits maximum number of acceptable edits * @param transpositions value passed to {@link LevenshteinAutomata} * @param nonFuzzyPrefix prefix length where edits are not allowed * @param minFuzzyLength minimum prefix length before any edits are allowed * @param unicodeAware treat prefix as unicode rather than bytes * @param maxDeterminizedStates maximum automaton states allowed for {@link LevenshteinAutomata} */ public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter, int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength, boolean unicodeAware, int maxDeterminizedStates) { super(analyzer, term, filter); this.maxEdits = maxEdits; this.transpositions = transpositions; this.nonFuzzyPrefix = nonFuzzyPrefix; this.minFuzzyLength = minFuzzyLength; this.unicodeAware = unicodeAware; this.maxDeterminizedStates = maxDeterminizedStates; } @Override public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException { CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()); Set refs = new HashSet<>(); Automaton automaton = toLevenshteinAutomata(stream.toAutomaton(unicodeAware), refs); if (unicodeAware) { Automaton utf8automaton = new UTF32ToUTF8().convert(automaton); utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates); automaton = utf8automaton; } // TODO Accumulating all refs is bad, because the resulting set may be very big. // TODO Better iterate over automaton again inside FuzzyCompletionWeight? return new FuzzyCompletionWeight(this, automaton, refs); } private Automaton toLevenshteinAutomata(Automaton automaton, Set refs) { List subs = new ArrayList<>(); FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton); for (IntsRef string; (string = finiteStrings.next()) != null;) { refs.add(IntsRef.deepCopyOf(string)); if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) { subs.add(Automata.makeString(string.ints, string.offset, string.length)); } else { int ints[] = new int[string.length - nonFuzzyPrefix]; System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length); // TODO: maybe add alphaMin to LevenshteinAutomata, // and pass 1 instead of 0? We probably don't want // to allow the trailing dedup bytes to be // edited... but then 0 byte is "in general" allowed // on input (but not in UTF8). LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions); subs.add(lev.toAutomaton(maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix))); } } if (subs.isEmpty()) { // automaton is empty, there is no accepted paths through it return Automata.makeEmpty(); // matches nothing } else if (subs.size() == 1) { // no synonyms or anything: just a single path through the tokenstream return subs.get(0); } else { // multiple paths: this is really scary! is it slow? // maybe we should not do this and throw UOE? Automaton a = Operations.union(subs); // TODO: we could call toLevenshteinAutomata() before det? // this only happens if you have multiple paths anyway (e.g. synonyms) return Operations.determinize(a, maxDeterminizedStates); } } /** * Get the maximum edit distance for fuzzy matches */ public int getMaxEdits() { return maxEdits; } /** * Return whether transpositions count as a single edit */ public boolean isTranspositions() { return transpositions; } /** * Get the length of a prefix where no edits are permitted */ public int getNonFuzzyPrefix() { return nonFuzzyPrefix; } /** * Get the minimum length of a term considered for matching */ public int getMinFuzzyLength() { return minFuzzyLength; } /** * Return true if lengths are measured in unicode code-points rather than bytes */ public boolean isUnicodeAware() { return unicodeAware; } /** * Get the maximum number of determinized states permitted */ public int getMaxDeterminizedStates() { return maxDeterminizedStates; } @Override public String toString(String field) { StringBuilder buffer = new StringBuilder(); if (!getField().equals(field)) { buffer.append(getField()); buffer.append(":"); } buffer.append(getTerm().text()); buffer.append('*'); buffer.append('~'); buffer.append(Integer.toString(maxEdits)); if (getFilter() != null) { buffer.append(","); buffer.append("filter"); buffer.append(getFilter().toString()); } return buffer.toString(); } private static class FuzzyCompletionWeight extends CompletionWeight { private final Set refs; int currentBoost = 0; public FuzzyCompletionWeight(CompletionQuery query, Automaton automaton, Set refs) throws IOException { super(query, automaton); this.refs = refs; } @Override protected void setNextMatch(IntsRef pathPrefix) { // NOTE: the last letter of the matched prefix for the exact // match never makes it through here // so an exact match and a match with only a edit at the // end is boosted the same int maxCount = 0; for (IntsRef ref : refs) { int minLength = Math.min(ref.length, pathPrefix.length); int count = 0; for (int i = 0; i < minLength; i++) { if (ref.ints[i + ref.offset] == pathPrefix.ints[i + pathPrefix.offset]) { count++; } else { break; } } maxCount = Math.max(maxCount, count); } currentBoost = maxCount; } @Override protected float boost() { return currentBoost; } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy