All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.suggest.analyzing.FuzzySuggester Maven / Gradle / Ivy

There is a newer version: 10.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.analyzing;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;

/**
 * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is based on the
 * Damerau-Levenshtein (optimal string alignment) algorithm, though you can explicitly choose
 * classic Levenshtein by passing false for the transpositions parameter.
 *
 * 

At most, this query will match terms up to {@value * org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} edits. Higher * distances are not supported. Note that the fuzzy distance is measured in "byte space" on the * bytes returned by the {@link TokenStream}'s {@link TermToBytesRefAttribute}, usually UTF8. By * default the analyzed bytes must be at least 3 {@link #DEFAULT_MIN_FUZZY_LENGTH} bytes before any * edits are considered. Furthermore, the first 1 {@link #DEFAULT_NON_FUZZY_PREFIX} byte is not * allowed to be edited. We allow up to 1 (@link #DEFAULT_MAX_EDITS} edit. If {@link #unicodeAware} * parameter in the constructor is set to true, maxEdits, minFuzzyLength, transpositions and * nonFuzzyPrefix are measured in Unicode code points (actual letters) instead of bytes. * *

NOTE: This suggester does not boost suggestions that required no edits over suggestions that * did require edits. This is a known limitation. * *

Note: complex query analyzers can have a significant impact on the lookup performance. It's * recommended to not use analyzers that drop or inject terms like synonyms to keep the complexity * of the prefix intersection low for good lookup performance. At index time, complex analyzers can * safely be used. * * @lucene.experimental */ public final class FuzzySuggester extends AnalyzingSuggester { private final int maxEdits; private final boolean transpositions; private final int nonFuzzyPrefix; private final int minFuzzyLength; private final boolean unicodeAware; /** * Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix parameters in Unicode code * points (actual letters) instead of bytes. */ public static final boolean DEFAULT_UNICODE_AWARE = false; /** * The default minimum length of the key passed to {@link #lookup} before any edits are allowed. */ public static final int DEFAULT_MIN_FUZZY_LENGTH = 3; /** The default prefix length where edits are not allowed. */ public static final int DEFAULT_NON_FUZZY_PREFIX = 1; /** The default maximum number of edits for fuzzy suggestions. */ public static final int DEFAULT_MAX_EDITS = 1; /** The default transposition value passed to {@link LevenshteinAutomata} */ public static final boolean DEFAULT_TRANSPOSITIONS = true; /** * Creates a {@link FuzzySuggester} instance initialized with default values. * * @param analyzer the analyzer used for this suggester */ public FuzzySuggester(Directory tempDir, String tempFileNamePrefix, Analyzer analyzer) { this(tempDir, tempFileNamePrefix, analyzer, analyzer); } /** * Creates a {@link FuzzySuggester} instance with an index and query analyzer initialized with * default values. * * @param indexAnalyzer Analyzer that will be used for analyzing suggestions while building the * index. * @param queryAnalyzer Analyzer that will be used for analyzing query text during lookup */ public FuzzySuggester( Directory tempDir, String tempFileNamePrefix, Analyzer indexAnalyzer, Analyzer queryAnalyzer) { this( tempDir, tempFileNamePrefix, indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS, DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE); } /** * Creates a {@link FuzzySuggester} instance. * * @param indexAnalyzer Analyzer that will be used for analyzing suggestions while building the * index. * @param queryAnalyzer Analyzer that will be used for analyzing query text during lookup * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} * @param maxSurfaceFormsPerAnalyzedForm Maximum number of surface forms to keep for a single * analyzed form. When there are too many surface forms we discard the lowest weighted ones. * @param maxGraphExpansions Maximum number of graph paths to expand from the analyzed form. Set * this to -1 for no limit. * @param preservePositionIncrements Whether position holes should appear in the automaton * @param maxEdits must be >= 0 and <= {@link * LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} . * @param transpositions true if transpositions should be treated as a primitive edit * operation. If this is false, comparisons will implement the classic Levenshtein algorithm. * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link * #DEFAULT_NON_FUZZY_PREFIX} * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default * {@link #DEFAULT_MIN_FUZZY_LENGTH}) * @param unicodeAware operate Unicode code points instead of bytes. */ public FuzzySuggester( Directory tempDir, String tempFileNamePrefix, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, boolean preservePositionIncrements, int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength, boolean unicodeAware) { super( tempDir, tempFileNamePrefix, indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements); if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { throw new IllegalArgumentException( "maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); } if (nonFuzzyPrefix < 0) { throw new IllegalArgumentException( "nonFuzzyPrefix must not be >= 0 (got " + nonFuzzyPrefix + ")"); } if (minFuzzyLength < 0) { throw new IllegalArgumentException( "minFuzzyLength must not be >= 0 (got " + minFuzzyLength + ")"); } this.maxEdits = maxEdits; this.transpositions = transpositions; this.nonFuzzyPrefix = nonFuzzyPrefix; this.minFuzzyLength = minFuzzyLength; this.unicodeAware = unicodeAware; } @Override protected List>> getFullPrefixPaths( List>> prefixPaths, Automaton lookupAutomaton, FST> fst) throws IOException { // TODO: right now there's no penalty for fuzzy/edits, // ie a completion whose prefix matched exactly what the // user typed gets no boost over completions that // required an edit, which get no boost over completions // requiring two edits. I suspect a multiplicative // factor is appropriate (eg, say a fuzzy match must be at // least 2X better weight than the non-fuzzy match to // "compete") ... in which case I think the wFST needs // to be log weights or something ... Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton)); /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8); w.write(levA.toDot()); w.close(); System.out.println("Wrote LevA to out.dot"); */ return FSTUtil.intersectPrefixPaths(levA, fst); } @Override protected Automaton convertAutomaton(Automaton a) { if (unicodeAware) { Automaton utf8automaton = new UTF32ToUTF8().convert(a); utf8automaton = Operations.determinize(utf8automaton, DEFAULT_DETERMINIZE_WORK_LIMIT); return utf8automaton; } else { return a; } } @Override TokenStreamToAutomaton getTokenStreamToAutomaton() { final TokenStreamToAutomaton tsta = super.getTokenStreamToAutomaton(); tsta.setUnicodeArcs(unicodeAware); return tsta; } Automaton toLevenshteinAutomata(Automaton automaton) { List subs = new ArrayList<>(); FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton); for (IntsRef string; (string = finiteStrings.next()) != null; ) { if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) { subs.add(Automata.makeString(string.ints, string.offset, string.length)); } else { int[] ints = new int[string.length - nonFuzzyPrefix]; System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length); // TODO: maybe add alphaMin to LevenshteinAutomata, // and pass 1 instead of 0? We probably don't want // to allow the trailing dedup bytes to be // edited... but then 0 byte is "in general" allowed // on input (but not in UTF8). LevenshteinAutomata lev = new LevenshteinAutomata( ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions); subs.add( lev.toAutomaton( maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix))); } } if (subs.isEmpty()) { // automaton is empty, there is no accepted paths through it return Automata.makeEmpty(); // matches nothing } else if (subs.size() == 1) { // no synonyms or anything: just a single path through the tokenstream return subs.get(0); } else { // multiple paths: this is really scary! is it slow? // maybe we should not do this and throw UOE? Automaton a = Operations.union(subs); // TODO: we could call toLevenshteinAutomata() before det? // this only happens if you have multiple paths anyway (e.g. synonyms) return Operations.determinize(a, DEFAULT_DETERMINIZE_WORK_LIMIT); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy