
lv.semti.morphology.lexicon.Paradigm Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of morphology Show documentation
Show all versions of morphology Show documentation
Latvian morphological analysis library
/*******************************************************************************
* Copyright 2008, 2009, 2014 Institute of Mathematics and Computer Science, University of Latvia
* Author: Pēteris Paikens
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*******************************************************************************/
package lv.semti.morphology.lexicon;
import java.io.IOException;
import java.io.Writer;
import java.util.HashMap;
import java.util.ArrayList;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import lv.semti.morphology.attributes.*;
public class Paradigm extends AttributeValues {
private Lexicon lexicon;
private int id = 0; // numurs pēc kārtas - ID
public String name = ""; // vārdiskais ID
private ArrayList < HashMap < String, ArrayList > > lexemesByStem
= new ArrayList >>();
// 1-3 hashmapi, kuros pēc saknes var atrast tai atbilstošās leksēmas
// vajadzētu to (un to apstaigājošās funkcijas) iznest kā klasi)
public HashMap lexemesByID = new HashMap (); //FIXME - nevajag iisti buut public, vajag tikai read-only iterēt
public ArrayList lexemes = new ArrayList (); //FIXME - nevajag iisti buut public, vajag tikai read-only iterēt
public ArrayList endings = new ArrayList (); //FIXME - nevajag iisti buut public, vajag tikai read-only iterēt
private Ending lemmaEnding = null; // kura no galotnēm uzskatāma par pamatformu
private int stems = 1; // cik saknes ir šai vārdgrupai (tipiski 1; darbībasvārdiem 3)
private String allowedGuessEndings = "";
public String description = "";
public Paradigm (Lexicon lexicon) {
this.lexicon = lexicon;
lexemesByStem.add(new HashMap>());
}
@Override
public void toXML (Writer straume) throws IOException {
straume.write("\n");
super.toXML(straume); // īpašības UzXML
for (Ending ending : endings)
ending.toXML(straume);
for (Lexeme leksēma : lexemes)
leksēma.toXML(straume);
straume.write(" \n");
}
/* saglabaa apakshleksikonaa tikai taas lekseemas, kuraam source sakriit ar noraadiito */
public void toXML_sub(Writer straume, String source) throws IOException {
straume.write("\n");
super.toXML(straume); // īpašības UzXML
for (Lexeme leksēma : lexemes) {
if (leksēma.isMatchingStrong(AttributeNames.i_Source, source))
leksēma.toXML(straume);
}
straume.write(" \n");
}
public Paradigm(Lexicon lexicon, Node node) {
super(node);
if (!node.getNodeName().equalsIgnoreCase("Paradigm")) throw new Error("Node '" + node.getNodeName() + "' but Paradigm expected.");
this.lexicon = lexicon;
Node n = node.getAttributes().getNamedItem("Stems");
if (n != null)
this.setStems(Integer.parseInt(n.getTextContent()));
n = node.getAttributes().getNamedItem("ID");
if (n != null)
this.setID(Integer.parseInt(n.getTextContent()));
n = node.getAttributes().getNamedItem("Name");
if (n != null)
this.name = n.getTextContent();
n = node.getAttributes().getNamedItem("Description");
if (n != null)
this.setDescription(n.getTextContent());
NodeList nodes = node.getChildNodes();
for (int i = 0; i < nodes.getLength(); i++) {
if (nodes.item(i).getNodeName().equals("Ending"))
addEnding(new Ending(this, nodes.item(i)));
}
n = node.getAttributes().getNamedItem("LemmaEnding");
if (n != null)
this.setLemmaEnding(Integer.parseInt(n.getTextContent()));
n = node.getAttributes().getNamedItem("AllowedGuessEndings");
if (n != null)
this.allowedGuessEndings = n.getTextContent();
for (int i = 0; i < nodes.getLength(); i++) {
if (nodes.item(i).getNodeName().equals("Lexeme")) {
Lexeme l = new Lexeme(this, nodes.item(i));
addLexeme(l);
}
}
}
/***
* Takes an XML-sublexicon node of type 'Paradigm', and takes the Lexeme elements from there
* @param node
*/
public void addLexemesFromXML(Node node) {
if (!node.getNodeName().equalsIgnoreCase("Paradigm")) throw new Error("Node '" + node.getNodeName() + "' but Paradigm expected.");
NodeList nodes = node.getChildNodes();
for (int i = 0; i < nodes.getLength(); i++) {
if (nodes.item(i).getNodeName().equals("Lexeme")) {
Lexeme l = new Lexeme(this, nodes.item(i));
if (l != null) {
String frequency = l.getValue("Skaits"); // FIXME - hardcoded value
if (frequency == null || Integer.parseInt(frequency) > Lexicon.proper_name_frequency_floor)
addLexeme(l);
}
}
}
}
@Override
@SuppressWarnings("unchecked")
public Object clone() {
// uztaisa paradigmas kopiju, kurai var mainīt īpašības, nenočakarējot sākotnējo leksēmu DB.
Paradigm kopija;
try {
kopija = (Paradigm) super.clone();
kopija.lexemesByStem = (ArrayList >>)lexemesByStem.clone();
kopija.lexemes = (ArrayList )lexemes.clone();
kopija.endings = (ArrayList )endings.clone();
kopija.id = id;
return kopija;
} catch (CloneNotSupportedException e) {
e.printStackTrace();
return null;
}
}
public int numberOfLexemes () {
return lexemes.size();
}
public Ending getLemmaEnding(){
return lemmaEnding;
// Lieto tam, lai jaunu leksēmu ģenerējot, aizpildītu lemma lauku
}
public int numberOfEndings () {
return endings.size();
}
public void addLexeme (Lexeme lexeme) {
lexeme.setParadigm(this);
if (lexeme.getID() == 0) {
lexeme.setID( lexicon.newLexemeID());
} else {
// TODO - principā jau šī pārbaude ir OK
// Lexeme duplicatetest = lexemesByID.get(lexeme.getID());
// if (duplicatetest != null) {
// System.err.println("Lexemes with duplicate IDs:");
// duplicatetest.describe(new PrintWriter(System.err));
// lexeme.describe(new PrintWriter(System.err));
// }
}
lexemesByID.put(lexeme.getID(), lexeme);
lexeme.setStemCount(stems);
for (int i = 0; i < stems; i++) {
// pieliekam leksēmas 1-3 saknes vārdgrupas masīvos
ArrayList esošās = lexemesByStem.get(i).get(lexeme.getStem(i));
if (esošās == null) {
esošās = new ArrayList();
lexemesByStem.get(i).put(lexeme.getStem(i), esošās);
}
esošās.add(lexeme);
}
lexemes.add(lexeme);
if (lexeme.getValue(AttributeNames.i_Lemma) == null && getLemmaEnding() != null)
lexeme.addAttribute(AttributeNames.i_Lemma, lexeme.getStem(getLemmaEnding().stemID-1) + getLemmaEnding().getEnding());
if (this.isMatchingStrong(AttributeNames.i_ParadigmProperties, AttributeNames.v_HardcodedWordforms)) { // Hardcoded un vietniekvārdu paradigma
lexeme.describe();
this.lexicon.hardcodedForms.put(lexeme.getID(), lexeme);
}
String pamatforma = lexeme.getValue(AttributeNames.i_Lemma);
if (pamatforma.matches(".*[ ./'\\d]+.*") && pamatforma.length() > 1 && !pamatforma.matches("\\.+")
// || (this.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Punctuation) && pamatforma.length() > 1)
) {
this.lexicon.automats.addException(pamatforma);
}
}
public void removeLexeme (Lexeme lexeme) {
// ja nebūs tādas leksēmas, tad nekas arī nenotiks
lexemes.remove(lexeme);
lexemesByID.remove(lexeme.getID());
for (int i = 0; i < stems; i++) {
ArrayList matchingstems = lexemesByStem.get(i).get(lexeme.getStem(i));
if (matchingstems != null) {
matchingstems.remove(lexeme);
if (matchingstems.size()==0) lexemesByStem.get(i).remove(lexeme.getStem(i));
}
}
this.lexicon.hardcodedForms.remove(lexeme.getID(), lexeme);
}
public void addEnding (Ending ending) {
if (ending.getID() == 0) {
ending.setID( lexicon.maxEndingID() + 1 );
}
ending.setParadigm(this);
endings.add(ending);
lexicon.invalidateAllEndings();
}
public void removeEnding (Ending ending) {
// ja nebūs tādas galotnes, tad nekas arī nenotiks
endings.remove(ending);
lexicon.invalidateAllEndings();
}
public int getID() {
return id;
}
public void setID(int id) {
this.id = id;
}
public void setLemmaEnding(int lemmaEnding) {
this.lemmaEnding = endingByNr(lemmaEnding);
if (this.lemmaEnding == null)
System.err.printf("Error when loading paradigm %d - cannot find lemma ending %d\n", this.id, lemmaEnding);
}
public int getStems() {
return stems;
}
public void setStems(int stems) {
this.stems = stems;
while (lexemesByStem.size() > stems) lexemesByStem.remove(lexemesByStem.size()-1);
while (lexemesByStem.size() < stems) lexemesByStem.add(new HashMap>());
//FIXME - tā, a ko ar leksēmu sakņu skaitiem ta darīt tagad??
}
public String getName() {
return name;
}
public void setDescription(String description) {
this.description = description;
}
public Ending endingByNr(int endingNr) {
for (Ending ending : endings)
if (ending.getID() == endingNr) return ending;
return null;
}
public ArrayList>> getLexemesByStem() {
//TODO - jāprotektē
return lexemesByStem;
}
// Verifies if this stem is a valid stem for this paradigm, based on the last letter(s?) of that stem
public boolean allowedGuess(String stem) {
if (allowedGuessEndings.isEmpty()) return true; // FIXME - workaround until all paradigms have this data filled
if ((allowedGuessEndings.indexOf('!') >= 0) && !this.lexicon.guessAllParadigms) return false;
if (stem.isEmpty()) return false;
if (this.id == 12 && stem.endsWith("as")) return true; // FIXME Hardcoded -as inflexible nouns like Lithuanian derived surnames Arvydas etc
char lastchar = stem.charAt(stem.length()-1);
return (allowedGuessEndings.indexOf(lastchar) >= 0);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy