All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.github.liblevenshtein.transducer.LazyTransducerCollection Maven / Gradle / Ivy

There is a newer version: 3.0.1
Show newest version
package com.github.liblevenshtein.transducer;

import com.github.liblevenshtein.collection.AbstractIterator;

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Iterator;

/**
 * 

* Lazy implementation of {@link Iterable}, which in subsequent invocations of * its {@link #iterator()} searches only enough of the dictionary automaton to * find the next spelling candidate. Subsequent calls to {@link Iterator#next()} * pick up where the search stopped. *

* *

* Please note that the {@link #iterator()} is not currently threadsafe, so if * you will be consuming the same {@link Iterator} from multiple threads you * will need to synchronize access yourself. *

* *

* The algorithm for imitating Levenshtein automata was taken from the * following journal article: *

* *
 * 
 * {@literal @}ARTICLE {Schulz02faststring,
 *   author = {Klaus Schulz and Stoyan Mihov},
 *   title = {Fast String Correction with Levenshtein-Automata},
 *   journal = {INTERNATIONAL JOURNAL OF DOCUMENT ANALYSIS AND RECOGNITION},
 *   year = {2002},
 *   volume = {5},
 *   pages = {67--85}
 * }
 * 
 * 
* *

* As well, this Master Thesis helped me understand its concepts: *

* *
    *
  • www.fmi.uni-sofia.bg/fmi/logic/theses/mitankin-en.pdf
  • *
* *

* The supervisor of the student who submitted the thesis was one of the authors * of the journal article, above. *

* * @author Dylon Edwards * @param Kind of nodes of the dictionary automaton. * @param Kind of the spelling candidates returned from the * dictionary. * @since 2.1.2 */ public class LazyTransducerCollection extends AbstractIterator implements Iterable { /** * Query term whose spelling should be corrected. */ private final String term; /** * Maximum number of spelling errors candidates may have from the query term. */ private final int maxDistance; /** * Attributes required for this transducer to search the dictionary. */ private final TransducerAttributes attributes; /** * Breadth-first traversal of the dictionary automaton. */ private final Deque> pendingQueue = new ArrayDeque<>(); /** * Transitions one state to another. */ private final StateTransitionFunction stateTransition; /** * Helper variable used when determining the length of a characteristic * vector. */ private final int a; /** * Length of the next characteristic vector to return. */ private int k; /** * Offset of where to begin traversing the next characteristic vector. */ private int i; /** * Labels of the outgoing transitions for a dictionary state. */ private Iterator labels = null; /** * Current intersection between the dictionary and Levenshtein automata. */ private Intersection intersection = null; /** * Initializes a new LazyTransducerCollection with a query against the * dictionary automaton. * @param term Query term whose spelling should be corrected. * @param maxDistance Maximum number of spelling errors candidates may have * from the query term. * @param attributes Attributes required for this transducer to search the * dictionary. */ public LazyTransducerCollection( final String term, final int maxDistance, final TransducerAttributes attributes) { this.term = term; this.maxDistance = maxDistance; this.attributes = attributes; pendingQueue.addLast( new Intersection( attributes.dictionaryRoot(), attributes.initialState())); this.stateTransition = attributes.stateTransitionFactory().build(maxDistance, term.length()); // f(x) := x * 2 + 1 // a := (n - 1) / 2 // f(a) = (n - 1) / 2 * 2 + 1 = n - 1 + 1 = n // // We want to cap the value of "a" at "n = max integer" so "f(a)" does not // overflow, which it would if "a > (n - 1) / 2". In other words, define: // g(x) := { x * 2 + 1 , if a < (n - 1) / 2 // { n , otherwise // // If there was no upper bound for integer values, this would be equivalent: // h(x) := min {f(x), n} this.a = maxDistance < (Integer.MAX_VALUE - 1) >> 1 ? (maxDistance << 1) + 1 : Integer.MAX_VALUE; } /** * {@inheritDoc} */ @Override public Iterator iterator() { return this; } /** * {@inheritDoc} */ @Override protected void advance() { while (null == next && (null != labels && labels.hasNext() || !pendingQueue.isEmpty())) { if (null != labels && labels.hasNext()) { final DictionaryNode dictionaryNode = intersection.dictionaryNode(); final State levenshteinState = intersection.levenshteinState(); final char label = labels.next(); final DictionaryNode nextDictionaryNode = attributes.dictionaryTransition().of(dictionaryNode, label); final boolean[] characteristicVector = characteristicVector(label, term, k, i); final State nextLevenshteinState = stateTransition.of(levenshteinState, characteristicVector); if (null != nextLevenshteinState) { final Intersection nextIntersection = new Intersection<>( intersection, label, nextDictionaryNode, nextLevenshteinState); pendingQueue.addLast(nextIntersection); if (attributes.isFinal().at(nextDictionaryNode)) { final int distance = attributes.minDistance().at(nextLevenshteinState, term.length()); if (distance <= maxDistance) { final String nextCandidate = nextIntersection.candidate(); this.next = attributes.candidateFactory().build(nextCandidate, distance); } } } } else { this.intersection = pendingQueue.removeFirst(); final DictionaryNode dictionaryNode = intersection.dictionaryNode(); final State levenshteinState = intersection.levenshteinState(); this.i = levenshteinState.head().termIndex(); final int b = term.length() - i; this.k = a < b ? a : b; this.labels = attributes.dictionaryTransition().of(dictionaryNode); } } } /** * Returns the characteristic vector of the term, from its characters between * index i and index k. The characteristic vector contains true at each index * where the corresponding character of the term is the value of x, and false * elsewhere. * @param x char to find all occurrences of in the relevant substring of term * @param term Term in which to find all occurrences of the character, x * @param k Length of the substring of term to examine * @param i Base-index of the substring of term to examine * @return Characteristic vector marking where x appears in the relevant * substring of term. */ private boolean[] characteristicVector( final char x, final String term, final int k, final int i) { final boolean[] characteristicVector = new boolean[k]; for (int j = 0; j < k; ++j) { characteristicVector[j] = x == term.charAt(i + j); } return characteristicVector; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy