All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.analysis.hunspell.TrigramAutomaton Maven / Gradle / Ivy

There is a newer version: 8.11.4
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;

/**
 * An automaton allowing to achieve the same results as non-weighted {@link
 * GeneratingSuggester#ngramScore}, but faster (in O(s2.length) time).
 *
 * <p>The automaton is built once from a reference string {@code s1} and recognizes every substring
 * of {@code s1} whose length is between 1 and {@value #N}. Each such substring ends in its own
 * automaton state, and {@link #state2Score} stores how many times the substring occurs in {@code
 * s1}. Scoring a candidate then only requires stepping through the automaton once per candidate
 * character.
 *
 * <p>{@link #ngramScore} clears and refills a bit set owned by the instance, so concurrent calls on
 * the same instance would interfere with each other.
 */
class TrigramAutomaton {
  /** Maximum length of the substrings (n-grams) that are counted. */
  private static final int N = 3;

  private final CharacterRunAutomaton automaton;

  /** For each automaton state, the number of occurrences in s1 of the substring ending there. */
  private final int[] state2Score;

  /** Scratch set of states already counted during the current {@link #ngramScore} call. */
  private final FixedBitSet countedSubstrings;

  /** Smallest character of s1; all transition labels are stored relative to it. */
  private final char minChar;

  /**
   * Builds the automaton for all substrings of {@code s1} of length 1 to {@value #N}.
   *
   * @param s1 the reference string; must not be empty
   * @throws AssertionError if {@code s1} is empty, since there is no minimal character then
   */
  TrigramAutomaton(String s1) {
    // substring of s1 (length 1..N) -> number of its occurrences in s1
    Map<String, Integer> substringCounts = new HashMap<>();

    Automaton.Builder builder = new Automaton.Builder(s1.length() * N, s1.length() * N);
    int initialState = builder.createState();

    minChar = (char) s1.chars().min().orElseThrow(AssertionError::new);

    for (int start = 0; start < s1.length(); start++) {
      int limit = Math.min(s1.length(), start + N);
      for (int end = start + 1; end <= limit; end++) {
        substringCounts.merge(s1.substring(start, end), 1, Integer::sum);
      }

      // Add a chain of states spelling s1[start, limit); every prefix of the chain is itself
      // one of the counted substrings. Labels are shifted down by minChar.
      int state = initialState;
      for (int i = start; i < limit; i++) {
        int next = builder.createState();
        builder.addTransition(state, next, s1.charAt(i) - minChar);
        state = next;
      }
    }

    // Determinizing merges the chains sharing a prefix, so that equal substrings end up in the
    // same state.
    automaton =
        new CharacterRunAutomaton(
            Operations.determinize(builder.finish(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT));

    state2Score = new int[automaton.getSize()];
    for (Map.Entry<String, Integer> entry : substringCounts.entrySet()) {
      int state = runAutomatonOnStringChars(entry.getKey());
      // distinct substrings are expected to end in distinct states
      assert state2Score[state] == 0;
      state2Score[state] = entry.getValue();
    }
    countedSubstrings = new FixedBitSet(state2Score.length);
  }

  /**
   * Runs the automaton from state 0 over all characters of {@code s}.
   *
   * @param s a string whose characters are all expected to have transitions, e.g. a substring of
   *     the constructor argument
   * @return the state reached after the last character
   */
  private int runAutomatonOnStringChars(String s) {
    int state = 0;
    for (int i = 0; i < s.length(); i++) {
      state = automaton.step(state, s.charAt(i) - minChar);
    }
    return state;
  }

  /**
   * Calculates the n-gram similarity score between the string this automaton was built from and
   * {@code s2}.
   *
   * <p>Every distinct substring of {@code s2} of length 1 to 3 that is recognized by the automaton
   * contributes its occurrence count in the reference string, once per call. Bigram scores are only
   * included when the unigram score is at least 2, and trigram scores only when, additionally, the
   * bigram score is at least 2.
   *
   * @param s2 the candidate to compare with; each character is passed through {@link
   *     #transformChar} first
   * @return the combined score
   */
  int ngramScore(CharsRef s2) {
    countedSubstrings.clear(0, countedSubstrings.length());

    int score1 = 0, score2 = 0, score3 = 0; // scores for substrings of length 1, 2 and 3

    // states of running the automaton on substrings [i-1, i) and [i-2, i)
    int state1 = -1, state2 = -1;

    int limit = s2.length + s2.offset;
    for (int i = s2.offset; i < limit; i++) {
      char c = transformChar(s2.chars[i]);
      if (c < minChar) {
        // c can't occur in the reference string, so no counted substring may contain it
        state1 = state2 = -1;
        continue;
      }
      c -= minChar;

      // A non-positive previous state means there is no matching shorter substring to extend.
      // NOTE(review): step() presumably returns -1 when there's no transition; confirm in RunAutomaton.
      int state3 = state2 <= 0 ? 0 : automaton.step(state2, c);
      if (state3 > 0) {
        score3 += substringScore(state3, countedSubstrings);
      }

      state2 = state1 <= 0 ? 0 : automaton.step(state1, c);
      if (state2 > 0) {
        score2 += substringScore(state2, countedSubstrings);
      }

      state1 = automaton.step(0, c);
      if (state1 > 0) {
        score1 += substringScore(state1, countedSubstrings);
      }
    }

    int score = score1;
    if (score1 >= 2) {
      score += score2;
      if (score2 >= 2) {
        score += score3;
      }
    }
    return score;
  }

  /**
   * A hook applied to every character of the candidate before it is matched in {@link
   * #ngramScore}. This implementation returns the character unchanged.
   */
  char transformChar(char c) {
    return c;
  }

  /**
   * Returns the score of the substring ending in {@code state}, or 0 if that state has already
   * been counted. The state is marked as counted in {@code countedSubstrings} as a side effect.
   */
  private int substringScore(int state, FixedBitSet countedSubstrings) {
    if (countedSubstrings.getAndSet(state)) return 0;

    int score = state2Score[state];
    assert score > 0;
    return score;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy