// morfologik.stemming.DictionaryLookup (Maven / Gradle / Ivy)
package morfologik.stemming;
import static morfologik.fsa.MatchResult.*;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.*;
import java.util.*;
import morfologik.fsa.*;
import morfologik.util.BufferUtils;
/**
 * This class implements a dictionary lookup over an FSA dictionary. The
 * dictionary for this class should be prepared from a text file using Jan
 * Daciuk's FSA package.
 *
 * <p>
 * <b>Important:</b> finite state automatons in Jan Daciuk's implementation use
 * <em>bytes</em>, not unicode characters. Therefore objects of this class always
 * have to be constructed with an encoding used to convert Java strings to byte
 * arrays and the other way around. You can use UTF-8 encoding, as it
 * should not conflict with any control sequences and separator characters.
 *
 * <p>
 * Every call to {@link #lookup(CharSequence)} reuses the same internal buffers,
 * {@link WordData} objects and result list, so the returned list is only valid
 * until the next call and a single instance should not be shared between
 * threads without external synchronization.
 *
 * <p>
 * NOTE(review): the raw {@code Iterable}, {@code List} and {@code Iterator}
 * types used below look like generic parameters that were lost; confirm
 * against {@code IStemmer} before parameterizing them.
 *
 * @see "Jan Daciuk's FSA package web site"
 */
public final class DictionaryLookup implements IStemmer, Iterable {
    /** An FSA traversal helper used for lookups. */
    private final FSATraversal matcher;

    /** An iterator for walking along the final states of {@link #fsa}. */
    private final FSAFinalStatesIterator finalStatesIterator;

    /** FSA's root node. */
    private final int rootNode;

    /** Expand buffers and arrays by this constant. */
    private static final int EXPAND_SIZE = 10;

    /** Private internal array of reusable word data objects. */
    private WordData[] forms = new WordData[0];

    /**
     * A list "view" over a leading slice of {@link #forms}; this is the object
     * returned from {@link #lookup(CharSequence)}.
     */
    private ArrayViewList formsList = new ArrayViewList(
            forms, 0, forms.length);

    /**
     * Features of the compiled dictionary.
     *
     * @see DictionaryMetadata
     */
    private final DictionaryMetadata dictionaryMetadata;

    /**
     * Charset encoder for the FSA.
     */
    private final CharsetEncoder encoder;

    /**
     * Charset decoder for the FSA.
     */
    private final CharsetDecoder decoder;

    /**
     * The FSA we are using.
     */
    private final FSA fsa;

    /**
     * @see #getSeparatorChar()
     */
    private final char separatorChar;

    /**
     * Internal reusable buffer holding the looked-up word encoded into bytes
     * using {@link #encoder}.
     */
    private ByteBuffer byteBuffer = ByteBuffer.allocate(0);

    /**
     * Internal reusable buffer collecting the characters of the looked-up word
     * before they are encoded using {@link #encoder}.
     */
    private CharBuffer charBuffer = CharBuffer.allocate(0);

    /**
     * Reusable match result.
     */
    private final MatchResult matchResult = new MatchResult();

    /**
     * The {@link Dictionary} this lookup is using.
     */
    private final Dictionary dictionary;

    /**
     * Creates a new object of this class using the given dictionary's FSA for
     * word lookups and the encoding named in its metadata for converting
     * characters to bytes.
     *
     * @param dictionary
     *            The dictionary (FSA and metadata) to look words up in.
     *
     * @throws IllegalArgumentException
     *             if FSA's root node cannot be acquired (dictionary is empty)
     *             or the dictionary has no metadata.
     * @throws RuntimeException
     *             if the metadata's encoding is not supported or the separator
     *             byte does not decode to exactly one character.
     */
    public DictionaryLookup(Dictionary dictionary)
            throws IllegalArgumentException {
        this.dictionary = dictionary;
        this.dictionaryMetadata = dictionary.metadata;
        this.rootNode = dictionary.fsa.getRootNode();
        this.fsa = dictionary.fsa;
        this.matcher = new FSATraversal(fsa);
        this.finalStatesIterator = new FSAFinalStatesIterator(fsa, fsa.getRootNode());

        if (rootNode == 0) {
            throw new IllegalArgumentException(
                    "Dictionary must have at least the root node.");
        }

        if (dictionaryMetadata == null) {
            throw new IllegalArgumentException(
                    "Dictionary metadata must not be null.");
        }

        try {
            Charset charset = Charset.forName(dictionaryMetadata.encoding);
            encoder = charset.newEncoder();
            decoder = charset.newDecoder().onMalformedInput(
                    CodingErrorAction.REPORT).onUnmappableCharacter(
                    CodingErrorAction.REPORT);
        } catch (UnsupportedCharsetException e) {
            // Keep the original exception as the cause so the failure stays traceable.
            throw new RuntimeException(
                    "FSA's encoding charset is not supported: "
                            + dictionaryMetadata.encoding, e);
        }

        // Mask to an unsigned value; a plain byte would be sign-extended and
        // bytes >= 0x80 would be reported as "ffffffXX".
        final String separatorHex = Integer.toHexString(dictionaryMetadata.separator & 0xFF);
        try {
            CharBuffer decoded = decoder.decode(ByteBuffer.wrap(new byte[] { dictionaryMetadata.separator }));
            if (decoded.remaining() != 1) {
                throw new RuntimeException("FSA's separator byte takes more than one character after conversion "
                        + " of byte 0x"
                        + separatorHex + " using encoding "
                        + dictionaryMetadata.encoding);
            }
            this.separatorChar = decoded.get();
        } catch (CharacterCodingException e) {
            throw new RuntimeException(
                    "FSA's separator character cannot be decoded from byte value 0x"
                            + separatorHex + " using encoding "
                            + dictionaryMetadata.encoding, e);
        }
    }

    /**
     * Searches the automaton for a symbol sequence equal to {@code word},
     * followed by a separator. The result is a stem (decompressed accordingly
     * to the dictionary's specification) and an optional tag data.
     *
     * @param word
     *            The inflected form to look up.
     *
     * @return A list of {@link WordData} entries found for {@code word}; empty
     *         if there are none, if {@code word} contains the separator
     *         character, or if it cannot be encoded in the dictionary's
     *         encoding. The list and its elements are reused and overwritten
     *         by the next call to this method.
     */
    public List lookup(CharSequence word) {
        final byte separator = dictionaryMetadata.separator;

        // Reset the output list to zero length.
        formsList.wrap(forms, 0, 0);

        // Collect the word's characters; a word containing the separator can
        // never be a dictionary key.
        charBuffer.clear();
        charBuffer = BufferUtils.ensureCapacity(charBuffer, word.length());
        for (int i = 0; i < word.length(); i++) {
            final char chr = word.charAt(i);
            if (chr == separatorChar) {
                return formsList;
            }
            charBuffer.put(chr);
        }
        charBuffer.flip();

        // Encode word characters into bytes in the same encoding as the FSA's.
        byteBuffer = charsToBytes(charBuffer, byteBuffer);

        // charsToBytes empties the buffer when the word cannot be encoded; such
        // a word must not be matched as if it were the empty sequence.
        if (word.length() > 0 && byteBuffer.remaining() == 0) {
            return formsList;
        }

        // Try to find a partial match in the dictionary.
        final MatchResult match = matcher.match(matchResult,
                byteBuffer.array(), 0, byteBuffer.remaining(), rootNode);
        if (match.kind != SEQUENCE_IS_A_PREFIX) {
            /*
             * This case is somewhat confusing: we should have hit the separator
             * first. No forms are reported for it.
             */
            return formsList;
        }

        /*
         * The entire sequence exists in the dictionary. A separator should be
         * the next symbol.
         */
        final int arc = fsa.getArc(match.node, separator);

        /*
         * The situation when the arc points to a final node should NEVER
         * happen. After all, we want the word to have SOME base form.
         */
        if (arc == 0 || fsa.isArcFinal(arc)) {
            return formsList;
        }

        // There is such a word in the dictionary. Return its base forms.
        int formsCount = 0;
        finalStatesIterator.restartFrom(fsa.getEndNode(arc));
        while (finalStatesIterator.hasNext()) {
            final ByteBuffer bb = finalStatesIterator.next();
            final byte[] ba = bb.array();
            final int bbSize = bb.remaining();

            if (formsCount >= forms.length) {
                // Slots below the old length were filled by earlier expansions,
                // so only the newly added tail needs fresh objects.
                final int oldLength = forms.length;
                forms = Arrays.copyOf(forms, oldLength + EXPAND_SIZE);
                for (int k = oldLength; k < forms.length; k++) {
                    forms[k] = new WordData(decoder);
                }
            }

            /*
             * Now, expand the prefix/ suffix 'compression' and store the base
             * form.
             */
            final WordData wordData = forms[formsCount++];
            wordData.reset();
            wordData.wordBuffer = byteBuffer;
            wordData.wordCharSequence = word;

            /*
             * Find the separator byte's position splitting the inflection
             * instructions from the tag.
             */
            int sepPos;
            for (sepPos = 0; sepPos < bbSize; sepPos++) {
                if (ba[sepPos] == separator) {
                    break;
                }
            }

            /*
             * Decode the stem into stem buffer (decodeStem clears the buffer
             * itself and may return a larger replacement).
             */
            wordData.stemBuffer = decodeStem(wordData.stemBuffer, ba,
                    sepPos, byteBuffer, dictionaryMetadata);
            wordData.stemBuffer.flip();

            // Skip separator character.
            sepPos++;

            /*
             * Decode the tag data. When no separator was found, tagSize is
             * negative and no tag is stored.
             */
            final int tagSize = bbSize - sepPos;
            if (tagSize > 0) {
                wordData.tagBuffer = BufferUtils.ensureCapacity(
                        wordData.tagBuffer, tagSize);
                wordData.tagBuffer.clear();
                wordData.tagBuffer.put(ba, sepPos, tagSize);
                wordData.tagBuffer.flip();
            }
        }

        formsList.wrap(forms, 0, formsCount);
        return formsList;
    }

    /**
     * Decode the base form of an inflected word and save its decoded form into
     * a byte buffer.
     *
     * <p>
     * The first byte(s) of {@code bytes} are counts encoded as offsets from
     * {@code 'A'}; how many of them are read depends on
     * {@code metadata.usesPrefixes} and {@code metadata.usesInfixes}. If any
     * count is negative or the counts do not fit the inflected form, the
     * encoded bytes are copied to the output verbatim.
     *
     * @param bb
     *            The byte buffer to save the result to. A new buffer may be
     *            allocated if the capacity of {@code bb} is not large enough
     *            to store the result. The buffer is not flipped upon return.
     *
     * @param bytes
     *            Bytes of the encoded base form, starting at 0 index.
     *
     * @param len
     *            Length of the encoded base form.
     *
     * @param inflectedBuffer
     *            Inflected form's bytes, in the dictionary's encoding. The
     *            bytes are read from index 0 of the backing array, for
     *            {@code remaining()} bytes.
     *
     * @param metadata
     *            Dictionary metadata selecting the prefix/infix encoding.
     *
     * @return Returns either {@code bb} or a new buffer whose capacity is
     *         large enough to store the output of the decoded data.
     */
    public static ByteBuffer decodeStem(ByteBuffer bb, byte[] bytes, int len,
            ByteBuffer inflectedBuffer, DictionaryMetadata metadata) {
        bb.clear();

        // Empty length? Weird, but return an empty buffer.
        if (len == 0) {
            return bb;
        }

        // Determine inflected string's length in bytes, in the same encoding.
        final byte[] infBytes = inflectedBuffer.array();
        final int infLen = inflectedBuffer.remaining();
        final int code0 = bytes[0] - 'A';

        final boolean fsaPrefixes = metadata.usesPrefixes;
        final boolean fsaInfixes = metadata.usesInfixes;

        // Increase buffer size, if needed.
        if (bb.capacity() < infLen + len) {
            bb = ByteBuffer.allocate(infLen + len);
        }

        if (code0 >= 0) {
            if (fsaInfixes) {
                // Note: Prefixes are silently assumed here.
                if (len > 2) {
                    final int skipLen = bytes[1] - 'A';
                    final int tailStrip = bytes[2] - 'A';
                    final int stripAtBeginning = skipLen + code0;
                    final int stripAtEnd = tailStrip + stripAtBeginning;
                    // Negative counts are junk: they would read outside the
                    // inflected form, so leave them to the fallback below.
                    if (skipLen >= 0 && tailStrip >= 0 && stripAtEnd <= infLen) {
                        bb.put(infBytes, 0, code0);
                        bb.put(infBytes, stripAtBeginning, infLen - stripAtEnd);
                        bb.put(bytes, 3, len - 3);
                        return bb;
                    }
                }
            } else if (fsaPrefixes) {
                if (len > 1) {
                    final int tailStrip = bytes[1] - 'A';
                    final int stripAtEnd = tailStrip + code0;
                    // A negative count is junk; leave it to the fallback below.
                    if (tailStrip >= 0 && stripAtEnd <= infLen) {
                        bb.put(infBytes, code0, infLen - stripAtEnd);
                        bb.put(bytes, 2, len - 2);
                        return bb;
                    }
                }
            } else {
                if (code0 <= infLen) {
                    bb.put(infBytes, 0, infLen - code0);
                    bb.put(bytes, 1, len - 1);
                    return bb;
                }
            }
        }

        /*
         * This is a fallback in case some junk is detected above. Return the
         * base form only if this is the case.
         */
        bb.clear();
        bb.put(bytes, 0, len);
        return bb;
    }

    /**
     * Encode a character sequence into a byte buffer, optionally expanding
     * buffer.
     *
     * @param chars
     *            Characters to encode; the buffer's position is restored
     *            before returning.
     * @param bytes
     *            Buffer to reuse for the output if it is large enough.
     *
     * @return The flipped output buffer (either {@code bytes} or a new, larger
     *         one). It is empty if the characters could not be encoded.
     */
    private ByteBuffer charsToBytes(CharBuffer chars, ByteBuffer bytes) {
        bytes.clear();

        final int maxCapacity = (int) (chars.remaining() * encoder
                .maxBytesPerChar());
        // Reallocate only when the existing buffer is really too small.
        if (bytes.capacity() < maxCapacity) {
            bytes = ByteBuffer.allocate(maxCapacity);
        }

        chars.mark();
        encoder.reset();
        if (encoder.encode(chars, bytes, true).isError()) {
            // Remove everything, we don't want to accept malformed input.
            bytes.clear();
        }
        bytes.flip();
        chars.reset();

        return bytes;
    }

    /**
     * Return an iterator over all {@link WordData} entries available in the
     * embedded {@link Dictionary}.
     */
    public Iterator iterator() {
        return new DictionaryIterator(dictionary, decoder, true);
    }

    /**
     * @return Return the {@link Dictionary} used by this object.
     */
    public Dictionary getDictionary() {
        return dictionary;
    }

    /**
     * @return Returns the logical separator character splitting inflected form,
     *         lemma correction token and a tag. Note that this character is a
     *         best-effort conversion from a byte in
     *         {@link DictionaryMetadata#separator} and may not be valid in the
     *         target encoding (although this is highly unlikely).
     */
    public char getSeparatorChar() {
        return separatorChar;
    }
}