All Downloads are FREE. Search and download functionalities are using the official Maven repository.

morfologik.stemming.DictionaryLookup Maven / Gradle / Ivy

package morfologik.stemming;

import static morfologik.fsa.MatchResult.SEQUENCE_IS_A_PREFIX;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import morfologik.fsa.ByteSequenceIterator;
import morfologik.fsa.FSA;
import morfologik.fsa.FSATraversal;
import morfologik.fsa.MatchResult;

/**
 * This class implements a dictionary lookup over an FSA dictionary. The
 * dictionary for this class should be prepared from a text file using Jan
 * Daciuk's FSA package (see link below).
 * 
 * 

* Important: finite state automatons in Jan Daciuk's implementation use * bytes not unicode characters. Therefore objects of this class always * have to be constructed with an encoding used to convert Java strings to byte * arrays and the other way around. You can use UTF-8 encoding, as it * should not conflict with any control sequences and separator characters. * * @see FSA package Web * site */ public final class DictionaryLookup implements IStemmer, Iterable { /** An FSA used for lookups. */ private final FSATraversal matcher; /** An iterator for walking along the final states of {@link #fsa}. */ private final ByteSequenceIterator finalStatesIterator; /** FSA's root node. */ private final int rootNode; /** Expand buffers and arrays by this constant. */ private final static int EXPAND_SIZE = 10; /** Private internal array of reusable word data objects. */ private WordData[] forms = new WordData[0]; /** A "view" over an array implementing */ private final ArrayViewList formsList = new ArrayViewList( forms, 0, forms.length); /** * Features of the compiled dictionary. * * @see DictionaryMetadata */ private final DictionaryMetadata dictionaryMetadata; /** * Charset encoder for the FSA. */ private final CharsetEncoder encoder; /** * Charset decoder for the FSA. */ private final CharsetDecoder decoder; /** * The FSA we are using. */ private final FSA fsa; /** * @see #getSeparatorChar() */ private final char separatorChar; /** * Internal reusable buffer for encoding words into byte arrays using * {@link #encoder}. */ private ByteBuffer byteBuffer = ByteBuffer.allocate(0); /** * Internal reusable buffer for encoding words into byte arrays using * {@link #encoder}. */ private CharBuffer charBuffer = CharBuffer.allocate(0); /** * Reusable match result. */ private final MatchResult matchResult = new MatchResult(); /** * The {@link Dictionary} this lookup is using. */ private final Dictionary dictionary; private final ISequenceEncoder sequenceEncoder; /** * Creates a new object of this class using the given FSA for word lookups * and encoding for converting characters to bytes. * * @param dictionary The dictionary to use for lookups. * @throws IllegalArgumentException * if FSA's root node cannot be acquired (dictionary is empty). */ public DictionaryLookup(Dictionary dictionary) throws IllegalArgumentException { this.dictionary = dictionary; this.dictionaryMetadata = dictionary.metadata; this.sequenceEncoder = dictionary.metadata.getSequenceEncoderType().get(); this.rootNode = dictionary.fsa.getRootNode(); this.fsa = dictionary.fsa; this.matcher = new FSATraversal(fsa); this.finalStatesIterator = new ByteSequenceIterator(fsa, fsa.getRootNode()); if (dictionaryMetadata == null) { throw new IllegalArgumentException( "Dictionary metadata must not be null."); } decoder = dictionary.metadata.getDecoder(); encoder = dictionary.metadata.getEncoder(); separatorChar = dictionary.metadata.getSeparatorAsChar(); } /** * Searches the automaton for a symbol sequence equal to word, * followed by a separator. The result is a stem (decompressed accordingly * to the dictionary's specification) and an optional tag data. */ @Override public List lookup(CharSequence word) { final byte separator = dictionaryMetadata.getSeparator(); if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) { word = applyReplacements(word, dictionaryMetadata.getInputConversionPairs()); } // Reset the output list to zero length. formsList.wrap(forms, 0, 0); // Encode word characters into bytes in the same encoding as the FSA's. charBuffer.clear(); charBuffer = BufferUtils.ensureCapacity(charBuffer, word.length()); for (int i = 0; i < word.length(); i++) { char chr = word.charAt(i); if (chr == separatorChar) return formsList; charBuffer.put(chr); } charBuffer.flip(); byteBuffer = charsToBytes(charBuffer, byteBuffer); // Try to find a partial match in the dictionary. final MatchResult match = matcher.match(matchResult, byteBuffer .array(), 0, byteBuffer.remaining(), rootNode); if (match.kind == SEQUENCE_IS_A_PREFIX) { /* * The entire sequence exists in the dictionary. A separator should * be the next symbol. */ final int arc = fsa.getArc(match.node, separator); /* * The situation when the arc points to a final node should NEVER * happen. After all, we want the word to have SOME base form. */ if (arc != 0 && !fsa.isArcFinal(arc)) { // There is such a word in the dictionary. Return its base forms. int formsCount = 0; finalStatesIterator.restartFrom(fsa.getEndNode(arc)); while (finalStatesIterator.hasNext()) { final ByteBuffer bb = finalStatesIterator.next(); final byte[] ba = bb.array(); final int bbSize = bb.remaining(); if (formsCount >= forms.length) { forms = Arrays.copyOf(forms, forms.length + EXPAND_SIZE); for (int k = 0; k < forms.length; k++) { if (forms[k] == null) forms[k] = new WordData(decoder); } } /* * Now, expand the prefix/ suffix 'compression' and store * the base form. */ final WordData wordData = forms[formsCount++]; if (dictionaryMetadata.getOutputConversionPairs().isEmpty()) { wordData.update(byteBuffer, word); } else { wordData.update(byteBuffer, applyReplacements(word, dictionaryMetadata.getOutputConversionPairs())); } /* * Find the separator byte's position splitting the inflection instructions * from the tag. */ int sepPos; for (sepPos = 0; sepPos < bbSize; sepPos++) { if (ba[sepPos] == separator) { break; } } /* * Decode the stem into stem buffer. */ wordData.stemBuffer = sequenceEncoder.decode(wordData.stemBuffer, byteBuffer, ByteBuffer.wrap(ba, 0, sepPos)); // Skip separator character. sepPos++; /* * Decode the tag data. */ final int tagSize = bbSize - sepPos; if (tagSize > 0) { wordData.tagBuffer = BufferUtils.ensureCapacity(wordData.tagBuffer, tagSize); wordData.tagBuffer.clear(); wordData.tagBuffer.put(ba, sepPos, tagSize); wordData.tagBuffer.flip(); } } formsList.wrap(forms, 0, formsCount); } } else { /* * this case is somewhat confusing: we should have hit the separator * first... I don't really know how to deal with it at the time * being. */ } return formsList; } /** * Apply partial string replacements from a given map. * * Useful if the word needs to be normalized somehow (i.e., ligatures, * apostrophes and such). * * @param word The word to apply replacements to. * @param replacements A map of replacements (from->to). * @return Returns a new string with all replacements applied. */ public static String applyReplacements(CharSequence word, LinkedHashMap replacements) { // quite horrible from performance point of view; this should really be a transducer. StringBuilder sb = new StringBuilder(word); for (final Map.Entry e : replacements.entrySet()) { String key = e.getKey(); int index = sb.indexOf(e.getKey()); while (index != -1) { sb.replace(index, index + key.length(), e.getValue()); index = sb.indexOf(key, index + key.length()); } } return sb.toString(); } /** * Encode a character sequence into a byte buffer, optionally expanding * buffer. */ private ByteBuffer charsToBytes(CharBuffer chars, ByteBuffer bytes) { bytes.clear(); final int maxCapacity = (int) (chars.remaining() * encoder .maxBytesPerChar()); if (bytes.capacity() <= maxCapacity) { bytes = ByteBuffer.allocate(maxCapacity); } chars.mark(); encoder.reset(); if (encoder.encode(chars, bytes, true).isError()) { // remove everything, we don't want to accept malformed input bytes.clear(); } bytes.flip(); chars.reset(); return bytes; } /** * Return an iterator over all {@link WordData} entries available in the * embedded {@link Dictionary}. */ @Override public Iterator iterator() { return new DictionaryIterator(dictionary, decoder, true); } /** * @return Return the {@link Dictionary} used by this object. */ public Dictionary getDictionary() { return dictionary; } /** * @return Returns the logical separator character splitting inflected form, * lemma correction token and a tag. Note that this character is a best-effort * conversion from a byte in {@link DictionaryMetadata#separator} and * may not be valid in the target encoding (although this is highly unlikely). */ public char getSeparatorChar() { return separatorChar; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy