
morfologik.tools.InflectionFramesTool Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of morfologik-stemming Show documentation
Show all versions of morfologik-stemming Show documentation
Morfologik provides high-quality lemmatisation for the Polish language,
along with tools for building and using byte-based finite-state automata.
package morfologik.tools;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.*;
import java.util.*;
import java.util.Map.Entry;
import morfologik.stemming.*;
import morfologik.stemming.Dictionary;
/**
* Calculate inflection frames from the Polish dictionary.
*/
public class InflectionFramesTool {
public static void main(String[] args) throws IOException {
new InflectionFramesTool().inflectionFrames();
}
/* */
@SuppressWarnings( { "unused" })
public void inflectionFrames() throws IOException {
final Dictionary pl = Dictionary.getForLanguage("pl");
final DictionaryLookup dict = new DictionaryLookup(pl);
final CharsetDecoder decoder = Charset.forName(pl.metadata.encoding)
.newDecoder().onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
final HashMap> forms =
new HashMap>();
ByteBuffer stemBuffer = ByteBuffer.allocate(0);
ByteBuffer inflBuffer = ByteBuffer.allocate(0);
ByteBuffer stemDecoded = ByteBuffer.allocate(0);
int limit = Integer.MAX_VALUE;
final Iterator i = new DictionaryIterator(pl, decoder, false);
while (i.hasNext() && limit-- > 0) {
final WordData wd = i.next();
final CharSequence inflected = wd.getWord();
final CharSequence stemEncoded = wd.getStem();
final CharSequence tag = wd.getTag();
if (tag == null)
continue;
inflBuffer.clear();
inflBuffer = wd.getWordBytes(inflBuffer);
stemBuffer.clear();
stemBuffer = wd.getStemBytes(stemBuffer);
stemDecoded = DictionaryLookup.decodeStem(stemDecoded, stemBuffer
.array(), stemBuffer.remaining(), inflBuffer, pl.metadata);
stemDecoded.flip();
final String stem = decoder.decode(stemDecoded).toString();
final String form = tag.toString().intern();
ArrayList frames = forms.get(stem);
if (frames == null) {
forms.put(stem, frames = new ArrayList());
}
if (!frames.contains(form)) {
frames.add(form);
}
}
// Sort the forms so that we get a unique key. Then iteratively add them
// to another hash (by form this time).
final HashMap> frames =
new HashMap>();
StringBuilder key = new StringBuilder();
for (Map.Entry> e : forms.entrySet()) {
Collections.sort(e.getValue());
key.setLength(0);
for (String s : e.getValue())
key.append(s).append(" ");
final String k = key.toString();
ArrayList words = frames.get(k);
if (words == null) {
frames.put(k, words = new ArrayList());
}
words.add(e.getKey());
e.setValue(null);
}
// Print inflection frames.
ArrayList>> entries =
new ArrayList>>();
entries.addAll(frames.entrySet());
Collections.sort(entries,
new Comparator>>() {
public int compare(Entry> o1,
Entry> o2) {
return o2.getValue().size() - o1.getValue().size();
}
});
for (Map.Entry> e : entries) {
System.out.println(String.format("%6d %s %s",
e.getValue().size(), e.getKey(), e.getValue()));
}
System.out.println("Total frames: " + frames.size());
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy