// org.apache.lucene.search.suggest.analyzing.FuzzySuggester (lucene-suggest module)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.suggest.analyzing;
import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;
/**
 * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is based on the
 * Damerau-Levenshtein (optimal string alignment) algorithm, though you can explicitly choose
 * classic Levenshtein by passing {@code false} for the {@code transpositions} parameter.
*
* At most, this query will match terms up to {@value
* org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} edits. Higher
* distances are not supported. Note that the fuzzy distance is measured in "byte space" on the
* bytes returned by the {@link TokenStream}'s {@link TermToBytesRefAttribute}, usually UTF8. By
* default the analyzed bytes must be at least 3 {@link #DEFAULT_MIN_FUZZY_LENGTH} bytes before any
* edits are considered. Furthermore, the first 1 {@link #DEFAULT_NON_FUZZY_PREFIX} byte is not
 * allowed to be edited. We allow up to 1 {@link #DEFAULT_MAX_EDITS} edit. If {@link #unicodeAware}
* parameter in the constructor is set to true, maxEdits, minFuzzyLength, transpositions and
* nonFuzzyPrefix are measured in Unicode code points (actual letters) instead of bytes.
*
 * <p>NOTE: This suggester does not boost suggestions that required no edits over suggestions that
* did require edits. This is a known limitation.
*
 * <p>Note: complex query analyzers can have a significant impact on the lookup performance. It's
* recommended to not use analyzers that drop or inject terms like synonyms to keep the complexity
* of the prefix intersection low for good lookup performance. At index time, complex analyzers can
* safely be used.
*
* @lucene.experimental
*/
public final class FuzzySuggester extends AnalyzingSuggester {
private final int maxEdits;
private final boolean transpositions;
private final int nonFuzzyPrefix;
private final int minFuzzyLength;
private final boolean unicodeAware;
/**
* Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix parameters in Unicode code
* points (actual letters) instead of bytes.
*/
public static final boolean DEFAULT_UNICODE_AWARE = false;
/**
* The default minimum length of the key passed to {@link #lookup} before any edits are allowed.
*/
public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;
/** The default prefix length where edits are not allowed. */
public static final int DEFAULT_NON_FUZZY_PREFIX = 1;
/** The default maximum number of edits for fuzzy suggestions. */
public static final int DEFAULT_MAX_EDITS = 1;
/** The default transposition value passed to {@link LevenshteinAutomata} */
public static final boolean DEFAULT_TRANSPOSITIONS = true;
/**
* Creates a {@link FuzzySuggester} instance initialized with default values.
*
* @param analyzer the analyzer used for this suggester
*/
public FuzzySuggester(Directory tempDir, String tempFileNamePrefix, Analyzer analyzer) {
this(tempDir, tempFileNamePrefix, analyzer, analyzer);
}
/**
* Creates a {@link FuzzySuggester} instance with an index and query analyzer initialized with
* default values.
*
* @param indexAnalyzer Analyzer that will be used for analyzing suggestions while building the
* index.
* @param queryAnalyzer Analyzer that will be used for analyzing query text during lookup
*/
public FuzzySuggester(
Directory tempDir,
String tempFileNamePrefix,
Analyzer indexAnalyzer,
Analyzer queryAnalyzer) {
this(
tempDir,
tempFileNamePrefix,
indexAnalyzer,
queryAnalyzer,
EXACT_FIRST | PRESERVE_SEP,
256,
-1,
true,
DEFAULT_MAX_EDITS,
DEFAULT_TRANSPOSITIONS,
DEFAULT_NON_FUZZY_PREFIX,
DEFAULT_MIN_FUZZY_LENGTH,
DEFAULT_UNICODE_AWARE);
}
/**
* Creates a {@link FuzzySuggester} instance.
*
* @param indexAnalyzer Analyzer that will be used for analyzing suggestions while building the
* index.
* @param queryAnalyzer Analyzer that will be used for analyzing query text during lookup
* @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
* @param maxSurfaceFormsPerAnalyzedForm Maximum number of surface forms to keep for a single
* analyzed form. When there are too many surface forms we discard the lowest weighted ones.
* @param maxGraphExpansions Maximum number of graph paths to expand from the analyzed form. Set
* this to -1 for no limit.
* @param preservePositionIncrements Whether position holes should appear in the automaton
* @param maxEdits must be >= 0 and <= {@link
* LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} .
* @param transpositions true
if transpositions should be treated as a primitive edit
* operation. If this is false, comparisons will implement the classic Levenshtein algorithm.
* @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link
* #DEFAULT_NON_FUZZY_PREFIX}
* @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default
* {@link #DEFAULT_MIN_FUZZY_LENGTH})
* @param unicodeAware operate Unicode code points instead of bytes.
*/
public FuzzySuggester(
Directory tempDir,
String tempFileNamePrefix,
Analyzer indexAnalyzer,
Analyzer queryAnalyzer,
int options,
int maxSurfaceFormsPerAnalyzedForm,
int maxGraphExpansions,
boolean preservePositionIncrements,
int maxEdits,
boolean transpositions,
int nonFuzzyPrefix,
int minFuzzyLength,
boolean unicodeAware) {
super(
tempDir,
tempFileNamePrefix,
indexAnalyzer,
queryAnalyzer,
options,
maxSurfaceFormsPerAnalyzedForm,
maxGraphExpansions,
preservePositionIncrements);
if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new IllegalArgumentException(
"maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
}
if (nonFuzzyPrefix < 0) {
throw new IllegalArgumentException(
"nonFuzzyPrefix must not be >= 0 (got " + nonFuzzyPrefix + ")");
}
if (minFuzzyLength < 0) {
throw new IllegalArgumentException(
"minFuzzyLength must not be >= 0 (got " + minFuzzyLength + ")");
}
this.maxEdits = maxEdits;
this.transpositions = transpositions;
this.nonFuzzyPrefix = nonFuzzyPrefix;
this.minFuzzyLength = minFuzzyLength;
this.unicodeAware = unicodeAware;
}
@Override
protected List>> getFullPrefixPaths(
List>> prefixPaths,
Automaton lookupAutomaton,
FST> fst)
throws IOException {
// TODO: right now there's no penalty for fuzzy/edits,
// ie a completion whose prefix matched exactly what the
// user typed gets no boost over completions that
// required an edit, which get no boost over completions
// requiring two edits. I suspect a multiplicative
// factor is appropriate (eg, say a fuzzy match must be at
// least 2X better weight than the non-fuzzy match to
// "compete") ... in which case I think the wFST needs
// to be log weights or something ...
Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
w.write(levA.toDot());
w.close();
System.out.println("Wrote LevA to out.dot");
*/
return FSTUtil.intersectPrefixPaths(levA, fst);
}
@Override
protected Automaton convertAutomaton(Automaton a) {
if (unicodeAware) {
Automaton utf8automaton = new UTF32ToUTF8().convert(a);
utf8automaton = Operations.determinize(utf8automaton, DEFAULT_DETERMINIZE_WORK_LIMIT);
return utf8automaton;
} else {
return a;
}
}
@Override
TokenStreamToAutomaton getTokenStreamToAutomaton() {
final TokenStreamToAutomaton tsta = super.getTokenStreamToAutomaton();
tsta.setUnicodeArcs(unicodeAware);
return tsta;
}
Automaton toLevenshteinAutomata(Automaton automaton) {
List subs = new ArrayList<>();
FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton);
for (IntsRef string; (string = finiteStrings.next()) != null; ) {
if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) {
subs.add(Automata.makeString(string.ints, string.offset, string.length));
} else {
int[] ints = new int[string.length - nonFuzzyPrefix];
System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length);
// TODO: maybe add alphaMin to LevenshteinAutomata,
// and pass 1 instead of 0? We probably don't want
// to allow the trailing dedup bytes to be
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
LevenshteinAutomata lev =
new LevenshteinAutomata(
ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
subs.add(
lev.toAutomaton(
maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix)));
}
}
if (subs.isEmpty()) {
// automaton is empty, there is no accepted paths through it
return Automata.makeEmpty(); // matches nothing
} else if (subs.size() == 1) {
// no synonyms or anything: just a single path through the tokenstream
return subs.get(0);
} else {
// multiple paths: this is really scary! is it slow?
// maybe we should not do this and throw UOE?
Automaton a = Operations.union(subs);
// TODO: we could call toLevenshteinAutomata() before det?
// this only happens if you have multiple paths anyway (e.g. synonyms)
return Operations.determinize(a, DEFAULT_DETERMINIZE_WORK_LIMIT);
}
}
}