All downloads are free. Search and download functionality uses the official Maven repository.

com.github.dylon.liblevenshtein.levenshtein.LazyTransducerCollection Maven / Gradle / Ivy

There is a newer version: 2.2.2
Show newest version
// Generated by delombok at Mon Apr 04 11:50:59 PDT 2016
package com.github.dylon.liblevenshtein.levenshtein;

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Iterator;
import it.unimi.dsi.fastutil.chars.CharIterator;
import lombok.NonNull;
import com.github.dylon.liblevenshtein.collection.AbstractIterator;

/**
 * 

* The algorithm for imitating Levenshtein automata was taken from the * following journal article: *

*
 * 
 * {@literal @}ARTICLE {Schulz02faststring,
 * author = {Klaus Schulz and Stoyan Mihov},
 * title = {Fast String Correction with Levenshtein-Automata},
 * journal = {INTERNATIONAL JOURNAL OF DOCUMENT ANALYSIS AND RECOGNITION},
 * year = {2002},
 * volume = {5},
 * pages = {67--85}
 * }
 * 
 * 
*

* As well, this Master Thesis helped me understand its concepts: *

*
    *
  • www.fmi.uni-sofia.bg/fmi/logic/theses/mitankin-en.pdf
  • *
*

* The supervisor of the student who submitted the thesis was one of the authors * of the journal article, above. *

* * @param Kind of nodes of the dictionary automaton. * @param Kind of the spelling candidates returned from the * dictionary. * @author Dylon Edwards * @since 2.1.2 */ public class LazyTransducerCollection extends AbstractIterator implements ICandidateCollection { /** * Query term whose spelling should be corrected. */ @NonNull private final String term; /** * Maximum number of spelling errors candidates may have from the query term. */ private final int maxDistance; /** * Attributes required for this transducer to search the dictionary. */ @NonNull private final TransducerAttributes attributes; /** * Breadth-first traversal of the dictionary automaton. */ private final Deque> pendingQueue = new ArrayDeque<>(); /** * Transitions one state to another. */ private final IStateTransitionFunction stateTransition; /** * Helper variable used when determining the length of a characteristic * vector. */ private final int a; /** * Length of the next characteristic vector to return. */ private int k; /** * Offset of where to begin traversing the next characteristic vector. */ private int i; /** * Labels of the outgoing transitions for a dictionary state. */ private CharIterator labels = null; /** * Dictionary node represented by the current intersection between the * dictionary automaton and the Levenshtein automaton. * @see #levenshteinState * @see #candidate */ private DictionaryNode dictionaryNode = null; /** * Levenshtein state represented by the current intersection between the * dictionary automaton and the Levenshtein automaton. * @see #dictionaryNode * @see #candidate */ private IState levenshteinState = null; /** * Prefix of the dictionary, from its root to {@link #dictionaryNode}. * @see #dictionaryNode * @see #levenshteinState */ private String candidate = null; /** * Initializes a new LazyTransducerCollection with a query against the * dictionary automaton. * @param term Query term whose spelling should be corrected. 
* @param maxDistance Maximum number of spelling errors candidates may have * from the query term. * @param attributes Attributes required for this transducer to search the * dictionary. */ public LazyTransducerCollection(@NonNull final String term, final int maxDistance, @NonNull final TransducerAttributes attributes) { if (term == null) { throw new java.lang.IllegalArgumentException("term is null"); } if (attributes == null) { throw new java.lang.IllegalArgumentException("attributes is null"); } this.term = term; this.maxDistance = maxDistance; this.attributes = attributes; pendingQueue.addLast(attributes.intersectionFactory().build("", attributes.dictionaryRoot(), attributes.initialState())); this.stateTransition = attributes.stateTransitionFactory().build(maxDistance); // f(x) := x * 2 + 1 // a := (n - 1) / 2 // f(a) = (n - 1) / 2 * 2 + 1 = n - 1 + 1 = n // // We want to cap the value of "a" at "n = max integer" so "f(a)" does not // overflow, which it would if "a > (n - 1) / 2". In other words, define: // g(x) := { x * 2 + 1 , if a < (n - 1) / 2 // { n , otherwise // // If there was no upper bound for integer values, this would be equivalent: // h(x) := min {f(x), n} this.a = (maxDistance < (Integer.MAX_VALUE - 1) >> 1) ? 
(maxDistance << 1) + 1 : Integer.MAX_VALUE; } /** * {@inheritDoc} */ @Override public Iterator iterator() { return this; } /** * {@inheritDoc} */ @Override protected void advance() { while (null == next && (null != labels && labels.hasNext() || !pendingQueue.isEmpty())) { if (null != labels && labels.hasNext()) { final char label = labels.nextChar(); final DictionaryNode nextDictionaryNode = attributes.dictionaryTransition().of(dictionaryNode, label); final boolean[] characteristicVector = characteristicVector(label, term, k, i); final IState nextLevenshteinState = stateTransition.of(levenshteinState, characteristicVector); if (null != nextLevenshteinState) { final String nextCandidate = candidate + label; pendingQueue.addLast(attributes.intersectionFactory().build(nextCandidate, nextDictionaryNode, nextLevenshteinState)); if (attributes.isFinal().at(nextDictionaryNode)) { final int distance = attributes.minDistance().at(nextLevenshteinState, term.length()); if (distance <= maxDistance) { this.next = attributes.candidateFactory().build(nextCandidate, distance); } } } } else { final Intersection intersection = pendingQueue.removeFirst(); this.candidate = intersection.candidate(); this.dictionaryNode = intersection.dictionaryNode(); this.levenshteinState = intersection.levenshteinState(); this.i = levenshteinState.getOuter(0)[0]; final int b = term.length() - i; this.k = (a < b) ? a : b; this.labels = attributes.dictionaryTransition().of(dictionaryNode); } } } /** * Returns the characteristic vector of the term, from its characters between * index i and index k. The characteristic vector contains true at each index * where the corresponding character of the term is the value of x, and false * elsewhere. 
* @param x char to find all occurrences of in the relevant substring of term * @param term Term in which to find all occurrences of the character, x * @param k Length of the substring of term to examine * @param i Base-index of the substring of term to examine * @return Characteristic vector marking where x appears in the relevant * substring of term. */ private boolean[] characteristicVector(final char x, final String term, final int k, final int i) { final boolean[] characteristicVector = new boolean[k]; for (int j = 0; j < k; ++j) { characteristicVector[j] = (x == term.charAt(i + j)); } return characteristicVector; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy