All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.search.FuzzyAutomatonBuilder Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.search;

import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;

/**
 * Produces {@link CompiledAutomaton} instances for fuzzy matching against a
 * single term. The builder is configured with a maximum edit distance, the
 * length of a fixed prefix, and a flag saying whether transpositions are
 * allowed.
 */
class FuzzyAutomatonBuilder {

  /** The original term, kept so it can be reported if automaton construction fails. */
  private final String term;
  /** Largest edit distance an automaton is built for. */
  private final int maxEdits;
  /** Levenshtein automaton factory created over the non-prefix part of the term. */
  private final LevenshteinAutomata levBuilder;
  /** The fixed leading part of the term, passed along on every automaton build. */
  private final String prefix;
  /** Length of the term counted in Unicode code points (not UTF-16 chars). */
  private final int termLength;

  /**
   * Validates the arguments and splits the term into a fixed prefix and the
   * remaining suffix over which the Levenshtein automata are built.
   *
   * @param term the term to match against
   * @param maxEdits maximum edit distance, from 0 up to
   *     {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} inclusive
   * @param prefixLength number of leading code points that form the fixed prefix;
   *     values larger than the term's code point count are clamped to that count
   * @param transpositions forwarded to the {@link LevenshteinAutomata} constructor
   * @throws IllegalArgumentException if {@code maxEdits} is out of range or
   *     {@code prefixLength} is negative
   */
  FuzzyAutomatonBuilder(String term, int maxEdits, int prefixLength, boolean transpositions) {
    final int maxSupported = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
    if (maxEdits < 0 || maxEdits > maxSupported) {
      throw new IllegalArgumentException("max edits must be 0.." + maxSupported + ", inclusive; got: " + maxEdits);
    }
    if (prefixLength < 0) {
      throw new IllegalArgumentException("prefixLength cannot be less than 0");
    }

    this.term = term;
    this.maxEdits = maxEdits;

    final int[] allCodePoints = stringToUTF32(term);
    this.termLength = allCodePoints.length;

    // The prefix can never be longer than the term itself.
    final int fixedLength = Math.min(prefixLength, allCodePoints.length);

    // Everything after the fixed prefix is what the automata are built over.
    final int[] fuzzyPart = new int[allCodePoints.length - fixedLength];
    System.arraycopy(allCodePoints, fixedLength, fuzzyPart, 0, fuzzyPart.length);

    this.levBuilder = new LevenshteinAutomata(fuzzyPart, Character.MAX_CODE_POINT, transpositions);
    this.prefix = UnicodeUtil.newString(allCodePoints, 0, fixedLength);
  }

  /**
   * Builds one automaton per edit distance from 0 through {@code maxEdits}.
   *
   * @return an array of length {@code maxEdits + 1} whose element {@code i}
   *     is the automaton built for edit distance {@code i}
   * @throws FuzzyTermsEnum.FuzzyTermsException if any automaton is too complex to determinize
   */
  CompiledAutomaton[] buildAutomatonSet() {
    final CompiledAutomaton[] automata = new CompiledAutomaton[maxEdits + 1];
    for (int edits = 0; edits < automata.length; edits++) {
      automata[edits] = compile(edits);
    }
    return automata;
  }

  /**
   * Builds only the automaton for the configured maximum edit distance.
   *
   * @return the automaton built for edit distance {@code maxEdits}
   * @throws FuzzyTermsEnum.FuzzyTermsException if the automaton is too complex to determinize
   */
  CompiledAutomaton buildMaxEditAutomaton() {
    return compile(maxEdits);
  }

  /**
   * Returns the length of the term in Unicode code points.
   */
  int getTermLength() {
    return termLength;
  }

  /**
   * Builds the compiled automaton for a single edit distance, rethrowing a
   * determinization failure as a {@code FuzzyTermsException} that carries the term.
   */
  private CompiledAutomaton compile(int edits) {
    try {
      return new CompiledAutomaton(levBuilder.toAutomaton(edits, prefix), true, false);
    } catch (TooComplexToDeterminizeException e) {
      throw new FuzzyTermsEnum.FuzzyTermsException(term, e);
    }
  }

  /**
   * Decodes a UTF-16 string into an array holding one int per Unicode code point.
   */
  private static int[] stringToUTF32(String text) {
    return text.codePoints().toArray();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy