// Source listing: morfologik.stemming.DictionaryLookup (Maven / Gradle / Ivy)
package morfologik.stemming;
import static morfologik.fsa.MatchResult.SEQUENCE_IS_A_PREFIX;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import morfologik.fsa.ByteSequenceIterator;
import morfologik.fsa.FSA;
import morfologik.fsa.FSATraversal;
import morfologik.fsa.MatchResult;
/**
 * This class implements a dictionary lookup over an FSA dictionary. The
 * dictionary for this class should be prepared from a text file using Jan
 * Daciuk's FSA package (see the reference below).
 *
 * <p>
 * <b>Important:</b> finite state automata in Jan Daciuk's implementation use
 * <em>bytes</em>, not Unicode characters. Therefore objects of this class always
 * have to be constructed with an encoding used to convert Java strings to byte
 * arrays and the other way around. You can use UTF-8 encoding, as it
 * should not conflict with any control sequences and separator characters.
 *
 * @see "Jan Daciuk's FSA package Web site"
 */
public final class DictionaryLookup implements IStemmer, Iterable {
/** Traversal helper over {@link #fsa}, used to match encoded words. */
private final FSATraversal matcher;
/**
 * An iterator for walking along the final states of {@link #fsa}. It is
 * restarted from the node following the separator arc on every successful
 * lookup.
 */
private final ByteSequenceIterator finalStatesIterator;
/** FSA's root node. */
private final int rootNode;
/** Number of slots by which the {@link #forms} array grows when it is full. */
private final static int EXPAND_SIZE = 10;
/** Private internal array of reusable word data objects. */
private WordData[] forms = new WordData[0];
/**
 * A list "view" over the {@link #forms} array. This is the object returned
 * from lookups; it is re-wrapped (not re-created) on every call.
 */
private final ArrayViewList formsList = new ArrayViewList(
forms, 0, forms.length);
/**
 * Features of the compiled dictionary.
 *
 * @see DictionaryMetadata
 */
private final DictionaryMetadata dictionaryMetadata;
/**
 * Charset encoder for the FSA.
 */
private final CharsetEncoder encoder;
/**
 * Charset decoder for the FSA.
 */
private final CharsetDecoder decoder;
/**
 * The FSA we are using.
 */
private final FSA fsa;
/**
 * @see #getSeparatorChar()
 */
private final char separatorChar;
/**
 * Internal reusable buffer holding the looked-up word encoded into bytes
 * using {@link #encoder}.
 */
private ByteBuffer byteBuffer = ByteBuffer.allocate(0);
/**
 * Internal reusable buffer holding the characters of the looked-up word
 * before they are encoded into {@link #byteBuffer}.
 */
private CharBuffer charBuffer = CharBuffer.allocate(0);
/**
 * Reusable match result.
 */
private final MatchResult matchResult = new MatchResult();
/**
 * The {@link Dictionary} this lookup is using.
 */
private final Dictionary dictionary;
/**
 * Sequence encoder obtained from the dictionary metadata; used to decode
 * the stem of each form found during a lookup.
 */
private final ISequenceEncoder sequenceEncoder;
/**
 * Creates a new lookup over the given dictionary. The character encoder and
 * decoder, the separator and the sequence encoder are all taken from the
 * dictionary's metadata.
 *
 * @param dictionary The dictionary to use for lookups.
 * @throws IllegalArgumentException
 *           if the dictionary's metadata is {@code null}.
 */
public DictionaryLookup(Dictionary dictionary)
    throws IllegalArgumentException {
  // Validate the metadata before it is dereferenced for the first time;
  // checking any later surfaces a NullPointerException instead of the
  // documented IllegalArgumentException.
  if (dictionary.metadata == null) {
    throw new IllegalArgumentException(
        "Dictionary metadata must not be null.");
  }

  this.dictionary = dictionary;
  this.dictionaryMetadata = dictionary.metadata;
  this.sequenceEncoder = dictionaryMetadata.getSequenceEncoderType().get();

  this.fsa = dictionary.fsa;
  this.rootNode = fsa.getRootNode();
  this.matcher = new FSATraversal(fsa);
  this.finalStatesIterator = new ByteSequenceIterator(fsa, rootNode);

  this.decoder = dictionaryMetadata.getDecoder();
  this.encoder = dictionaryMetadata.getEncoder();
  this.separatorChar = dictionaryMetadata.getSeparatorAsChar();
}
/**
 * Searches the automaton for a symbol sequence equal to {@code word},
 * followed by a separator. The result is a stem (decompressed accordingly
 * to the dictionary's specification) and an optional tag data.
 *
 * <p>The returned list is an internal view over reusable {@link WordData}
 * objects; it is reset at the start of every call to this method.
 *
 * @param word The word to look up. Input conversion pairs from the
 *        dictionary metadata, if any, are applied to it first.
 * @return The forms found for {@code word}; an empty list if the word
 *         contains the separator character or has no entry in the dictionary.
 */
@Override
public List<WordData> lookup(CharSequence word) {
  final byte separator = dictionaryMetadata.getSeparator();

  if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) {
    word = applyReplacements(word, dictionaryMetadata.getInputConversionPairs());
  }

  // Reset the output list to zero length.
  formsList.wrap(forms, 0, 0);

  // Copy the word into the reusable char buffer. A word containing the
  // separator character cannot be a dictionary entry, so bail out early.
  charBuffer.clear();
  charBuffer = BufferUtils.ensureCapacity(charBuffer, word.length());
  for (int i = 0; i < word.length(); i++) {
    final char chr = word.charAt(i);
    if (chr == separatorChar) {
      return formsList;
    }
    charBuffer.put(chr);
  }
  charBuffer.flip();

  // Encode word characters into bytes in the same encoding as the FSA's.
  byteBuffer = charsToBytes(charBuffer, byteBuffer);

  // Try to find a partial match in the dictionary.
  final MatchResult match = matcher.match(
      matchResult, byteBuffer.array(), 0, byteBuffer.remaining(), rootNode);
  if (match.kind != SEQUENCE_IS_A_PREFIX) {
    // Any other kind of match means the separator was not reached after the
    // word; there is nothing to report for it.
    return formsList;
  }

  // The entire sequence exists in the dictionary. A separator should be the
  // next symbol.
  final int arc = fsa.getArc(match.node, separator);

  // An arc pointing to a final node should never happen: the word must have
  // SOME base form following the separator.
  if (arc == 0 || fsa.isArcFinal(arc)) {
    return formsList;
  }

  // There is such a word in the dictionary. Collect its base forms.
  int formsCount = 0;
  // The output-converted word is the same for every form; compute it lazily
  // once instead of re-running the replacements on each iteration.
  CharSequence outputWord = null;

  finalStatesIterator.restartFrom(fsa.getEndNode(arc));
  while (finalStatesIterator.hasNext()) {
    final ByteBuffer bb = finalStatesIterator.next();
    final byte[] ba = bb.array();
    final int bbSize = bb.remaining();

    if (formsCount >= forms.length) {
      // Grow the reusable array; only the freshly added slots are empty.
      final int firstNewSlot = forms.length;
      forms = Arrays.copyOf(forms, forms.length + EXPAND_SIZE);
      for (int k = firstNewSlot; k < forms.length; k++) {
        forms[k] = new WordData(decoder);
      }
    }

    if (outputWord == null) {
      outputWord = dictionaryMetadata.getOutputConversionPairs().isEmpty()
          ? word
          : applyReplacements(word, dictionaryMetadata.getOutputConversionPairs());
    }

    // Now, expand the prefix/suffix 'compression' and store the base form.
    final WordData wordData = forms[formsCount++];
    wordData.update(byteBuffer, outputWord);

    // Find the separator byte's position splitting the inflection
    // instructions from the tag.
    int sepPos = 0;
    while (sepPos < bbSize && ba[sepPos] != separator) {
      sepPos++;
    }

    // Decode the stem into stem buffer.
    wordData.stemBuffer = sequenceEncoder.decode(
        wordData.stemBuffer, byteBuffer, ByteBuffer.wrap(ba, 0, sepPos));

    // Skip separator character.
    sepPos++;

    // Decode the tag data.
    // NOTE(review): when there is no tag the tag buffer is left untouched
    // here; presumably WordData.update() invalidates stale tag data -- confirm.
    final int tagSize = bbSize - sepPos;
    if (tagSize > 0) {
      wordData.tagBuffer = BufferUtils.ensureCapacity(wordData.tagBuffer, tagSize);
      wordData.tagBuffer.clear();
      wordData.tagBuffer.put(ba, sepPos, tagSize);
      wordData.tagBuffer.flip();
    }
  }

  formsList.wrap(forms, 0, formsCount);
  return formsList;
}
/**
 * Apply partial string replacements from a given map.
 *
 * Useful if the word needs to be normalized somehow (i.e., ligatures,
 * apostrophes and such).
 *
 * <p>Entries are applied in the map's iteration order, each one to the result
 * of the previous ones. Within a single entry the text inserted by a
 * replacement is not scanned again for the same key. Entries with an empty
 * key are ignored.
 *
 * @param word The word to apply replacements to.
 * @param replacements A map of replacements (from-&gt;to).
 * @return Returns a new string with all replacements applied.
 */
public static String applyReplacements(CharSequence word, LinkedHashMap<String, String> replacements) {
  // quite horrible from performance point of view; this should really be a transducer.
  final StringBuilder sb = new StringBuilder(word);
  for (final Map.Entry<String, String> e : replacements.entrySet()) {
    final String key = e.getKey();
    final String value = e.getValue();
    if (key.isEmpty()) {
      // An empty key matches at every position and would never terminate.
      continue;
    }

    int index = sb.indexOf(key);
    while (index != -1) {
      sb.replace(index, index + key.length(), value);
      // Resume right after the inserted value (not after the key's length):
      // a shorter value would otherwise skip following matches, and a longer
      // value containing the key would be matched again forever.
      index = sb.indexOf(key, index + value.length());
    }
  }
  return sb.toString();
}
/**
 * Encode a character sequence into a byte buffer, optionally expanding
 * buffer.
 *
 * <p>The position of {@code chars} is restored before returning. If the
 * encoder reports an error (malformed or unmappable input) the returned
 * buffer is empty.
 *
 * @param chars The characters to encode, from position to limit.
 * @param bytes A buffer to reuse for the output if it is large enough.
 * @return The buffer holding the encoded bytes, flipped for reading; either
 *         {@code bytes} itself or a newly allocated, larger buffer.
 */
private ByteBuffer charsToBytes(CharBuffer chars, ByteBuffer bytes) {
  bytes.clear();

  // Round up: truncating a fractional bytes-per-char product could leave the
  // buffer too small, and an overflow is not reported by isError() below.
  final int maxCapacity = (int) Math.ceil(
      chars.remaining() * (double) encoder.maxBytesPerChar());
  if (bytes.capacity() < maxCapacity) {
    bytes = ByteBuffer.allocate(maxCapacity);
  }

  chars.mark();
  encoder.reset();
  if (encoder.encode(chars, bytes, true).isError()) {
    // remove everything, we don't want to accept malformed input
    bytes.clear();
  }
  bytes.flip();
  chars.reset();

  return bytes;
}
/**
 * Creates an iterator over all {@link WordData} entries available in the
 * embedded {@link Dictionary}. Every call returns a new
 * {@link DictionaryIterator} built with this lookup's dictionary and
 * charset decoder.
 *
 * @return A fresh iterator over the dictionary's entries.
 */
@Override
public Iterator iterator() {
  // NOTE(review): the trailing 'true' flag presumably asks the iterator to
  // decode stems -- confirm against DictionaryIterator.
  final DictionaryIterator entries = new DictionaryIterator(this.dictionary, this.decoder, true);
  return entries;
}
/**
 * Gives access to the dictionary this lookup was constructed with.
 *
 * @return The {@link Dictionary} used by this object.
 */
public Dictionary getDictionary() {
  return this.dictionary;
}
/**
 * Returns the separator as a character, as obtained from the dictionary
 * metadata when this object was constructed.
 *
 * @return The logical separator character splitting inflected form, lemma
 *         correction token and a tag. Note that this character is a
 *         best-effort conversion from a byte in
 *         {@link DictionaryMetadata#separator} and may not be valid in the
 *         target encoding (although this is highly unlikely).
 */
public char getSeparatorChar() {
  return this.separatorChar;
}
}