All downloads are free. Search and download functionality uses the official Maven repository.

morfologik.tools.InflectionFramesTool Maven / Gradle / Ivy

Go to download

Morfologik provides high quality lemmatisation for the Polish language, along with tools for building and using byte-based finite state automata.

There is a newer version: 2.1.9
Show newest version
package morfologik.tools;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.*;
import java.util.*;
import java.util.Map.Entry;

import morfologik.stemming.*;
import morfologik.stemming.Dictionary;

/**
 * Calculate inflection frames from the Polish dictionary.
 */
public class InflectionFramesTool {
	public static void main(String[] args) throws IOException {
		new InflectionFramesTool().inflectionFrames();
	}

	/* */
	@SuppressWarnings( { "unused" })
	public void inflectionFrames() throws IOException {
		final Dictionary pl = Dictionary.getForLanguage("pl");
		final DictionaryLookup dict = new DictionaryLookup(pl);

		final CharsetDecoder decoder = Charset.forName(pl.metadata.encoding)
		        .newDecoder().onMalformedInput(CodingErrorAction.REPORT)
		        .onUnmappableCharacter(CodingErrorAction.REPORT);

		final HashMap> forms = 
			new HashMap>();

		ByteBuffer stemBuffer = ByteBuffer.allocate(0);
		ByteBuffer inflBuffer = ByteBuffer.allocate(0);
		ByteBuffer stemDecoded = ByteBuffer.allocate(0);

		int limit = Integer.MAX_VALUE;

		final Iterator i = new DictionaryIterator(pl, decoder, false);
		while (i.hasNext() && limit-- > 0) {
			final WordData wd = i.next();

			final CharSequence inflected = wd.getWord();
			final CharSequence stemEncoded = wd.getStem();
			final CharSequence tag = wd.getTag();
			if (tag == null)
				continue;

			inflBuffer.clear();
			inflBuffer = wd.getWordBytes(inflBuffer);

			stemBuffer.clear();
			stemBuffer = wd.getStemBytes(stemBuffer);

			stemDecoded = DictionaryLookup.decodeStem(stemDecoded, stemBuffer
			        .array(), stemBuffer.remaining(), inflBuffer, pl.metadata);
			stemDecoded.flip();

			final String stem = decoder.decode(stemDecoded).toString();
			final String form = tag.toString().intern();

			ArrayList frames = forms.get(stem);
			if (frames == null) {
				forms.put(stem, frames = new ArrayList());
			}

			if (!frames.contains(form)) {
				frames.add(form);
			}
		}

		// Sort the forms so that we get a unique key. Then iteratively add them
		// to another hash (by form this time).
		final HashMap> frames = 
			new HashMap>();

		StringBuilder key = new StringBuilder();
		for (Map.Entry> e : forms.entrySet()) {
			Collections.sort(e.getValue());

			key.setLength(0);
			for (String s : e.getValue())
				key.append(s).append(" ");

			final String k = key.toString();
			ArrayList words = frames.get(k);
			if (words == null) {
				frames.put(k, words = new ArrayList());
			}
			words.add(e.getKey());

			e.setValue(null);
		}

		// Print inflection frames.
		ArrayList>> entries = 
			new ArrayList>>();

		entries.addAll(frames.entrySet());
		Collections.sort(entries,
		        new Comparator>>() {
			        public int compare(Entry> o1,
			                Entry> o2) {
				        return o2.getValue().size() - o1.getValue().size();
			        }
		        });

		for (Map.Entry> e : entries) {
			System.out.println(String.format("%6d   %s %s",
			        e.getValue().size(), e.getKey(), e.getValue()));
		}

		System.out.println("Total frames: " + frames.size());
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy