lv.semti.morphology.lexicon.Lexicon Maven / Gradle / Ivy
                 Go to download
                
        
                    Show more of this group  Show more artifacts with this name
Show all versions of morphology Show documentation
                Show all versions of morphology Show documentation
Latvian morphological analysis library
                
             The newest version!
        
        /*******************************************************************************
 * Copyright 2008, 2009, 2014 Institute of Mathematics and Computer Science, University of Latvia
 * Author: Pēteris Paikens
 * 
 *     This program is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 * 
 *     This program is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU General Public License for more details.
 * 
 *     You should have received a copy of the GNU General Public License
 *     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *******************************************************************************/
package lv.semti.morphology.lexicon;
import lv.semti.morphology.analyzer.AllEndings;
import lv.semti.morphology.analyzer.Mijas;
import lv.semti.morphology.analyzer.Trie;
import lv.semti.morphology.analyzer.Variants;
import lv.semti.morphology.attributes.AttributeNames;
import java.io.*;
import java.util.*;
import java.util.regex.Pattern;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
/**
 * Satur leksikona datus - leksēmu sarakstu un to locīšanas informāciju
 * Piedāvā funkcijas tos pārveidot uz/no XML, un papildināt/izdzēst ierakstus.
 *
 * @author Pēteris Paikens
 *
 */
public class Lexicon {
	/** Classpath resource name used by the no-argument constructors. */
	public final static String DEFAULT_LEXICON_FILE = "Lexicon_v2.xml";
	public static int proper_name_frequency_floor = 2; // When loading proper name lexemes, entries that have a frequency ("Skaits") field will be ignored and not loaded
	protected String filename;
	// Default Latvian derivation prefixes; overridable via the lexicon file's
	// Prefixes section (see loadPrefixes).
	protected String NEGATION_PREFIX = "ne";
	protected String DEBITIVE_PREFIX = "jā";
	protected String SUPERLATIVE_PREFIX = "vis";
	/** @return the "revision" attribute of the loaded lexicon XML, or null if absent. */
	public String getRevision() {
		return revision;
	}
	private String revision;
	private String licence;

	// FIX(review): restored the generic type parameters that were stripped from
	// this source copy - the raw collection types could not compile against the
	// typed for-each loops used throughout this class.
	public ArrayList<Paradigm> paradigms; //TODO - should not be public, callers only need read-only iteration
	private AllEndings allEndings = null; // lazily built cache, see getAllEndings()
	protected ArrayList<String> prefixes;
	private ArrayList<String> corpusFileNames = new ArrayList<String>();
	// Patterns classifying the capitalization of words
	protected static Pattern p_firstcap = Pattern.compile("\\p{Lu}.*");
	protected static Pattern p_allcaps = Pattern.compile("(\\p{Lu})*"); // NOTE(review): also matches the empty string - confirm intended
	protected static Pattern p_doublesurname = Pattern.compile("\\p{Lu}.+-\\p{Lu}.+");
	// Element types not evident from this file - left raw; TODO confirm key/value types
	protected Multimap hardcodedForms = ArrayListMultimap.create();
	public Trie automats = new Trie();
	public boolean guessAllParadigms = false; // Attempt guessing words even in paradigms where AllowedGuessEndings are marked with !
	/**
	 * Creates a lexicon object from the default location in JAR resources,
	 * auxiliary lexicons included.
	 *
	 * @throws Exception	parsing errors
	 */
	public Lexicon() throws Exception {
		InputStream stream = getClass().getClassLoader().getResourceAsStream(DEFAULT_LEXICON_FILE);
		if (stream == null)
			throw new IOException("Can't find '" + DEFAULT_LEXICON_FILE + "'.");
		init(stream, true);
	}
	/**
	 * Creates a lexicon object from the default location in JAR resources.
	 *
	 * @param useAuxiliaryLexicons	whether the auxiliary sublexicons should be loaded too
	 * @throws Exception	parsing errors
	 */
	public Lexicon(boolean useAuxiliaryLexicons) throws Exception {
		InputStream stream = getClass().getClassLoader().getResourceAsStream(DEFAULT_LEXICON_FILE);
		if (stream == null)
			throw new IOException("Can't find '" + DEFAULT_LEXICON_FILE + "'.");
		init(stream, useAuxiliaryLexicons);
	}
	
	/**
	 * Builds a lexicon object from an XML file, auxiliary lexicons included.
	 *
	 * @param filename	name of the file to read the lexicon from
	 * @throws Exception	parsing errors
	 */
	public Lexicon(String filename) throws Exception {
		init(filename, true);
	}
	
	/**
	 * Builds a lexicon object from an XML file.
	 *
	 * @param lexiconFileName	name of the file to read the lexicon from
	 * @param useAuxiliaryLexicons	whether the auxiliary sublexicons should be loaded too
	 * @throws Exception	parsing errors
	 */
	public Lexicon(String lexiconFileName, boolean useAuxiliaryLexicons) throws Exception {
		init(lexiconFileName, useAuxiliaryLexicons);
	}
	/**
	 * Builds a lexicon object from an XML file, skipping the sublexicon files
	 * named in the blacklist.
	 * FIX(review): restored the generic type parameter stripped from this source copy.
	 *
	 * @param lexiconFileName	name of the file to read the lexicon from
	 * @param blacklist	sublexicon file names that must NOT be loaded
	 * @throws Exception	parsing errors
	 */
	public Lexicon(String lexiconFileName, ArrayList<String> blacklist) throws Exception{
		init(lexiconFileName, blacklist);
	}
	/**
	 * @return the file name the lexicon was loaded from, or null if the
	 *         lexicon was read from an input stream.
	 */
	public String getFilename() {
		return filename;
	}
	/**
	 * Lazily builds (and caches) the collection of all endings of every paradigm,
	 * skipping paradigms that contain only hardcoded wordforms.
	 * The cache is dropped via invalidateAllEndings().
	 */
	protected AllEndings getAllEndings(){
		if (allEndings == null) {
			// FIX(review): restored the generic type parameter stripped from this source copy
			ArrayList<Ending> endings = new ArrayList<Ending>();
			for (Paradigm paradigm : paradigms) {
				if (!paradigm.isMatchingStrong(AttributeNames.i_ParadigmProperties, AttributeNames.v_OnlyHardcodedWordforms))
					endings.addAll(paradigm.endings);
			}
			allEndings = new AllEndings(endings);
		}
		
		return allEndings;
	}
	
	// Drops the cached AllEndings aggregate so that the next getAllEndings()
	// call rebuilds it; must be called whenever paradigms/endings change.
	void invalidateAllEndings() {
		allEndings = null;
	}
	
	/**
	 * Loads the lexicon from a classpath resource or, failing that, from a file on disk.
	 *
	 * @param fileName	resource/file name of the main lexicon XML
	 * @param useAuxiliaryLexicons	whether the auxiliary sublexicons should be loaded too
	 * @throws Exception	parsing errors
	 */
	private void init(String fileName, boolean useAuxiliaryLexicons) throws Exception {
		System.err.println("Loading " + fileName);
		this.filename = fileName;
		Document doc;
		DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
		InputStream stream = getClass().getClassLoader().getResourceAsStream(fileName);
		if (stream != null) {
			// FIX(review): the classpath stream was never closed before
			try {
				doc = docBuilder.parse(stream);
			} finally {
				stream.close();
			}
		} else doc = docBuilder.parse(new File(fileName));
		init_main(doc, new File(fileName).getParent(), useAuxiliaryLexicons);
	}
	/**
	 * Loads the lexicon from a classpath resource or a file on disk, skipping
	 * the sublexicon files named in the blacklist.
	 * FIX(review): restored the stripped generic type; close the classpath stream.
	 *
	 * @param fileName	resource/file name of the main lexicon XML
	 * @param blacklist	sublexicon file names that must NOT be loaded
	 * @throws Exception	parsing errors
	 */
	private void init(String fileName, ArrayList<String> blacklist) throws Exception {
		System.err.println("Loading " + fileName);
		this.filename = fileName;
		Document doc;
		DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
		InputStream stream = getClass().getClassLoader().getResourceAsStream(fileName);
		if (stream != null) {
			try {
				doc = docBuilder.parse(stream);
			} finally {
				stream.close();
			}
		} else doc = docBuilder.parse(new File(fileName));
		init_main(doc, new File(fileName).getParent(), blacklist);
	}	
	/**
	 * Loads the lexicon from an already-open input stream; the caller owns the stream.
	 *
	 * @param plusma	stream containing the main lexicon XML
	 * @param useAuxiliaryLexicons	whether the auxiliary sublexicons should be loaded too
	 * @throws Exception	parsing errors
	 */
	private void init(InputStream plusma, boolean useAuxiliaryLexicons) throws Exception {
		System.err.println("Loading the lexicon from an input stream...");
		
		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
		Document doc = factory.newDocumentBuilder().parse(plusma);
		init_main(doc, null, useAuxiliaryLexicons);
	}
	// Convenience overload of the full init_main: no blacklist, core sublexicons enabled.
	private void init_main(Document doc, String path, boolean useAuxiliaryLexicons) throws Exception {
		init_main(doc, path, useAuxiliaryLexicons, true, null);
	}
	
	// Convenience overload of the full init_main: all sublexicons enabled, with a
	// blacklist of sublexicon file names to skip.
	// FIX(review): restored the generic type parameter stripped from this source copy.
	private void init_main(Document doc, String path, ArrayList<String> blacklist) throws Exception{
		init_main(doc, path, true, true, blacklist);		
	}
	
	/**
	 * Core loading routine: validates the "Morphology" root element, reads its
	 * revision/licence attributes, loads every Paradigm child, the Prefixes
	 * configuration, and the list of sublexicon ("Corpus") files, then loads
	 * each selected sublexicon (XML or JSON).
	 *
	 * @param doc	parsed main lexicon document; root element must be "Morphology"
	 * @param path	directory to resolve sublexicon file names against; null = load them from the classpath
	 * @param useAuxiliaryLexicons	whether non-core sublexicons should be loaded
	 * @param useCore	whether sublexicons marked Type="core" should be loaded
	 * @param blacklist	sublexicon file names to skip, or null
	 * @throws Exception	parsing errors; Error on a wrong root element or unsupported sublexicon format
	 */
	private void init_main(Document doc, String path, boolean useAuxiliaryLexicons, boolean useCore, ArrayList<String> blacklist) throws Exception {
		Node node = doc.getDocumentElement();
		if (!node.getNodeName().equalsIgnoreCase("Morphology")) throw new Error("Node '" + node.getNodeName() + "' but Morphology expected!");
		Node nodeRevision = node.getAttributes().getNamedItem("revision");
		if (nodeRevision != null)
			revision = nodeRevision.getTextContent();
		
		Node nodeLicence = node.getAttributes().getNamedItem("licence");
		if (nodeLicence != null)
			licence = nodeLicence.getTextContent();
		NodeList nodes = node.getChildNodes();
		// FIX(review): restored the generic type parameters stripped from this source copy
		prefixes = new ArrayList<String>();
		paradigms = new ArrayList<Paradigm>();
		for (int i = 0; i < nodes.getLength(); i++) {
			if (nodes.item(i).getNodeName().equals("Paradigm"))
				addParadigm(new Paradigm(this, nodes.item(i)));
			if (nodes.item(i).getNodeName().equals("Corpus")) {
				Node corpusFileName = nodes.item(i).getAttributes().getNamedItem("FileName");
				Node lexiconType = nodes.item(i).getAttributes().getNamedItem("Type");
				boolean isCore = false;
				if (lexiconType != null) isCore = lexiconType.getTextContent().equalsIgnoreCase("core");
				
				if (corpusFileName != null && (useAuxiliaryLexicons || (isCore && useCore)))
					corpusFileNames.add(corpusFileName.getTextContent());
			}
			if (nodes.item(i).getNodeName().equals("Prefixes")) {
				this.loadPrefixes(nodes.item(i));
			}
		}
		
		for (String filename : corpusFileNames) {
			if (blacklist != null && blacklist.contains(filename)) continue; //FIXME - case sensitivity?
            if (filename.endsWith(".xml")) {
                Document doc2;
                DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
                if (path != null) {
                    String fullname = path + java.io.File.separatorChar + filename;
                    doc2 = docBuilder.parse(new File(fullname));
                } else {
                    // NOTE(review): classpath stream is not closed explicitly here - presumably
                    // acceptable for one-shot loading, but worth confirming
                    doc2 = docBuilder.parse(getClass().getClassLoader().getResourceAsStream(filename));
                }
                load_sublexicon_xml(doc2);
            } else if (filename.endsWith(".json")) {
                if (path != null) {
                    String fullname = path + java.io.File.separatorChar + filename;
                    // FIX(review): close the file stream after loading - it was leaked before
                    FileInputStream in = new FileInputStream(new File(fullname));
                    try {
                        load_sublexicon_json(in);
                    } finally {
                        in.close();
                    }
                } else {
                    load_sublexicon_json(getClass().getClassLoader().getResourceAsStream(filename));
                }
            } else throw new Error(String.format("Unsupported file format for sublexicon '%s'", filename));
		}
		automats.initializeExceptions();
		System.err.println("Lexicon " + (revision != null ? revision : "") + " loaded");
	}
	/**
	 * Reads the {@code Prefixes} configuration node: overrides the negation /
	 * superlative / debitive prefixes and collects the list of verb prefixes.
	 */
	private void loadPrefixes(Node node) {
		NodeList children = node.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			Node child = children.item(i);
			String name = child.getNodeName();
			// The element names are mutually exclusive, so an else-if chain is equivalent.
			if (name.equals("Negation")) {
				this.NEGATION_PREFIX = child.getTextContent();
			} else if (name.equals("Superlative")) {
				this.SUPERLATIVE_PREFIX = child.getTextContent();
			} else if (name.equals("Debitive")) {
				this.DEBITIVE_PREFIX = child.getTextContent();
			} else if (name.equals("VerbPrefix")) {
				this.prefixes.add(child.getTextContent());
			}
		}
	}
	/**
	 * Loads a JSON sublexicon: one JSON object (lexeme) per line, UTF-8 encoded.
	 * Lexemes named "irt"/"art" (with or without a homonym index) get the Rare
	 * frequency attribute - presumably to down-rank these short verbs; TODO confirm.
	 *
	 * @param stream	stream with one JSON lexeme per line; closed when done
	 * @throws ParseException	on malformed JSON input
	 * @throws UnsupportedEncodingException	never in practice - UTF-8 is always supported
	 */
	private void load_sublexicon_json(InputStream stream) throws ParseException, UnsupportedEncodingException {
        JSONParser parser = new JSONParser();
        // FIX(review): try-with-resources - the reader (and underlying stream)
        // were never closed before
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"))) {
            String json_row;
            while ((json_row = reader.readLine()) != null) {
                Lexeme l = new Lexeme((JSONObject) parser.parse(json_row), this);
                if (l.isMatchingStrong(AttributeNames.i_EntryName, "irt:1")
						|| l.isMatchingStrong(AttributeNames.i_EntryName, "irt")
						|| l.isMatchingStrong(AttributeNames.i_EntryName, "art:1")
                        || l.isMatchingStrong(AttributeNames.i_EntryName, "art")) {
                    l.addAttribute(AttributeNames.i_Frequency, AttributeNames.v_Rare);
                }
            }
        } catch (IOException e) {
            // Best-effort loading: log and keep whatever lexemes were read so far.
            e.printStackTrace();
        }
	}
    /**
     * Loads lexemes from an XML sublexicon document (root element "Lexicon"):
     * each Paradigm child with an ID attribute is matched against the main
     * morphology by that ID and its lexemes are appended to the paradigm.
     */
    private void load_sublexicon_xml(Document doc) throws Exception {
		Node node = doc.getDocumentElement();
		if (!node.getNodeName().equalsIgnoreCase("Lexicon")) throw new Error("Node '" + node.getNodeName() + "' but Lexicon expected!");
	
		NodeList children = node.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			Node child = children.item(i);
			if (!child.getNodeName().equals("Paradigm")) continue;
			Node idAttribute = child.getAttributes().getNamedItem("ID");
			if (idAttribute == null) continue; // paradigms without an ID are skipped
			int paradigmID = Integer.parseInt(idAttribute.getTextContent());
			Paradigm paradigm = this.paradigmByID(paradigmID);
			if (paradigm == null)
				throw new Exception(String.format("When loading subcorpus, cannot find paradigm %d in main morphology", paradigmID));
			paradigm.addLexemesFromXML(child);
		}
	}
	
	/**
	 * Saves all lexicon data, paradigms and endings included, in XML format.
	 * Output is UTF-8 encoded; XML 1.0 suffices for Latvian characters in
	 * attributes as long as the XML parser is conformant.
	 * Writes to "name.new" first, then rotates the old file to "name.bak".
	 *
	 * @param failaVārds 	name of the file to save to
	 * @throws IOException
	 */
	public void toXML(String failaVārds) throws IOException {
	//TODO - should take an OutputStream instead of a file name.
		System.out.println("Warning! XML saving possibly obsolete after multiple-lexicon changes");
		
		File file = new File(failaVārds);
		File newfile = new File(failaVārds + ".new");
		File backupfile = new File(failaVārds + ".bak");
		
		Writer straume = new BufferedWriter(new OutputStreamWriter(
				new FileOutputStream(newfile), "UTF-8"));
		try {
			// FIX(review): the XML prolog and root element tags were missing here
			// (markup was evidently stripped from this source copy, leaving bare
			// "\n" writes); reconstructed so the output is well-formed and the root
			// matches what init_main expects when loading - TODO confirm attributes
			straume.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
			straume.write("<Morphology>\n");
			for (Paradigm paradigm : paradigms) {
				paradigm.toXML(straume);
			}
			straume.write("</Morphology>");
		} finally {
			straume.close(); // FIX(review): close even if writing fails
		}
		
		// remove old backup file
		if (backupfile.exists())
			backupfile.delete();
		
		// backup existing file
		if (file.exists())
			file.renameTo(backupfile);
		
		newfile.renameTo(file);
	}
	
	
	/**
	 * Saves, in XML format, the lexemes of a sublexicon - only those whose
	 * source matches the given one.
	 * Writes to "name.new" first, then rotates the old file to "name.bak".
	 *
	 * @param failaVārds 	name of the file to save to
	 * @param source	only lexemes with this Source attribute are written
	 * @throws IOException
	 */
	public void toXML_sub(String failaVārds, String source) throws IOException {
		File file = new File(failaVārds);
		File newfile = new File(failaVārds + ".new");
		File backupfile = new File(failaVārds + ".bak");
		
		Writer straume = new BufferedWriter(new OutputStreamWriter(
				new FileOutputStream(newfile), "UTF-8"));
		try {
			// FIX(review): the XML prolog and root element tags were missing here
			// (markup was evidently stripped from this source copy); reconstructed
			// with the "Lexicon" root that load_sublexicon_xml expects - TODO confirm
			straume.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
			straume.write("<Lexicon>\n");
			for (Paradigm paradigm : paradigms) {
				paradigm.toXML_sub(straume, source);
			}
			straume.write("</Lexicon>");
		} finally {
			straume.close(); // FIX(review): close even if writing fails
		}
		
		// remove old backup file
		if (backupfile.exists())
			backupfile.delete();
		
		// backup existing file
		if (file.exists())
			file.renameTo(backupfile);
		
		newfile.renameTo(file);
	}
	
	/**
	 * Saves all lexicon data, paradigms and endings included, in XML format to
	 * the given stream (UTF-8 encoded). Closes the stream when done.
	 *
	 * @param plusma 	stream to write the lexicon XML to
	 * @throws IOException
	 */
	public void toXML(OutputStream plusma) throws IOException {
		System.out.println("Warning! XML saving possibly obsolete after multiple-lexicon changes");
		
		Writer straume = new BufferedWriter(new OutputStreamWriter(plusma, "UTF-8"));
		try {
			// FIX(review): the XML prolog and root element tags were missing here
			// (markup was evidently stripped from this source copy); reconstructed
			// to match the "Morphology" root that init_main expects - TODO confirm
			straume.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
			straume.write("<Morphology>\n");
			for (Paradigm paradigm : paradigms) {
				paradigm.toXML(straume);
			}
			straume.write("</Morphology>");
		} finally {
			straume.close(); // FIX(review): close even if writing fails
		}
	}
	
	/**
	 * Finds the paradigm with the given numeric ID.
	 *
	 * @param nr	paradigm ID
	 * @return		the matching paradigm, or null if none is found
	 */
	public Paradigm paradigmByID(int nr) {
		//FIXME - references by numeric ID should be phased out where possible.
		// Scan from the end: if several paradigms share an ID, the LAST one wins,
		// matching the original forward-scan-with-overwrite behavior.
		for (int i = paradigms.size() - 1; i >= 0; i--) {
			Paradigm candidate = paradigms.get(i);
			if (candidate.getID() == nr)
				return candidate;
		}
		return null;
	}
	/**
	 * Finds the paradigm with the given textual identifier (case-insensitive).
	 *
	 * @param name	paradigm name
	 * @return		the matching paradigm, or null if none is found
	 */
	public Paradigm paradigmByName(String name) {
		// Scan from the end: if several paradigms share a name, the LAST one wins,
		// matching the original forward-scan-with-overwrite behavior.
		for (int i = paradigms.size() - 1; i >= 0; i--) {
			Paradigm candidate = paradigms.get(i);
			if (candidate.getName().equalsIgnoreCase(name))
				return candidate;
		}
		return null;
	}
	/**
	 * Finds the ending with the given numeric ID.
	 *
	 * @param nr	ending ID
	 * @return		the matching ending, or null if none is found
	 */
	public Ending endingByID(int nr) {
		return getAllEndings().endingByID(nr);
	}
	/**
	 * Finds the lexeme with the given numeric ID.
	 * If several lexemes happen to share the ID, one of them is returned.
	 *
	 * @param nr	lexeme ID
	 * @return		the matching lexeme, or null if none is found
	 */
	public Lexeme lexemeByID(int nr) {
		Lexeme result = null;
		for (Paradigm paradigm : paradigms) {
			// TODO - will not notice duplicate IDs across different paradigms
			Lexeme candidate = paradigm.lexemesByID.get(nr);
			if (candidate != null)
				result = candidate;
		}
		return result;
	}
	/**
	 * Hands out a fresh, unique lexeme ID.
	 *
	 * @return	a lexeme ID not used by any currently loaded lexeme
	 */
	private int lexeme_id_counter = 1100000;
	int newLexemeID() {
		// Advance past any IDs already occupied by loaded lexemes - slow if
		// something above 1.1M is already loaded, but at least correct.
		do {
			lexeme_id_counter += 1;
		} while (lexemeByID(lexeme_id_counter) != null);
		return lexeme_id_counter;
	}
	/**
	 * Creates a new lexeme based on a wordform with a known ending ID, and appends it to the lexicon
	 * NB! If the paradigm needs multiple stems (1st conjugation verbs) then only the lemma stem will be added, and the other stems will be empty and need to be filled later
	 *
	 * @param word		full wordform of the word to be added
	 * @param ending	ending object of the word's lemma
	 * @param source	Description field of the lexeme source
	 * @return			The created lexeme or NULL if it couldn't be created
	 */
	public Lexeme createLexeme(String word, Ending ending, String source) {
		String stem;
		try {
			stem = ending.stem(word.toLowerCase());
			int mija = ending.getMija();
			if (mija != 0 && mija != 3) { // don't try to apply comparative and superlative forms
				// FIX(review): restored the generic type parameter stripped from this source copy
				ArrayList<Variants> celmi = Mijas.mijuVarianti(stem, mija, word.matches("\\p{Lu}.*"));
				if (celmi.size() == 0) return null; // apparently the stem change does not apply
				// FIXME ! There should be no possibility of null here!
				stem = celmi.get(0).celms;
				// FIXME - is it OK to naively take the first variant here?
			}
		} catch (Exception e) {
			// FIX(review): e.getStackTrace() used to be passed to print(), which only
			// printed the array's toString(); print the stack trace properly, and
			// separate the concatenated diagnostic values
			System.err.println("createLexeme failed: word '" + word + "', ending " + Integer.toString(ending.getID()) + ", source '" + source + "'");
			e.printStackTrace();
			return null;
		}
		Lexeme rezults = new Lexeme();
		rezults.setStemCount(ending.getParadigm().getStems());
		rezults.setStem(ending.stemID-1, stem); 
		ending.getParadigm().addLexeme(rezults); // At this moment the actual lemma is generated
		String lemma = rezults.getValue(AttributeNames.i_Lemma);
		lemma = recapitalize(lemma, word);
		rezults.addAttribute(AttributeNames.i_Lemma, lemma);
		rezults.addAttribute(AttributeNames.i_Source, source);
		clearCache();
		return rezults;
	}
	
	/**
	 * Creates a new lexeme based on a wordform with a known paradigm ID, and appends it to the lexicon
	 * NB! If the paradigm needs multiple stems (1st conjugation verbs) then only the lemma stem will be added, and the other stems will be empty and need to be filled later
	 *
	 * @param word		full wordform of the word to be added
	 * @param paradigmID	ID of the paradigm
	 * @param source	Description field of the lexeme source
	 * @return			The created lexeme or NULL if it couldn't be created
	 */
	public Lexeme createLexemeFromParadigm(String word, int paradigmID, String source) throws Exception{
		Paradigm paradigm = this.paradigmByID(paradigmID);
		if (paradigm == null)
			throw new Exception(String.format("createLexemeFromParadigm - invalid paradigm id %d passed for lexeme %s", paradigmID, word));
		if (word == null)
			throw new Exception("createLexemeFromParadigm - null lexeme string passed");
		Ending lemmaEnding = paradigm.getLemmaEnding();
		if (lemmaEnding == null)
			throw new Exception(String.format("createLexemeFromParadigm - null lemma ending at paradigm id %d for lexeme %s", paradigmID, word));
		
		// If we've been passed the appropriate lemma already, use the lemma ending directly.
		if (word.endsWith(lemmaEnding.getEnding()))
			return this.createLexeme(word, lemmaEnding, source);
		
		// Otherwise try to match the word against any non-vocative ending of the paradigm.
		// TODO - this assumes that the lemma will be the same regardless of which wordform we choose. Maybe that's not true for some stemchanges.
		for (Ending candidate : paradigm.endings) {
			if (candidate.isMatchingStrong(AttributeNames.i_Case, AttributeNames.v_Vocative))
				continue;
			if (word.endsWith(candidate.getEnding()))
				return this.createLexeme(word, candidate, source);
		}
		
		throw new Exception(String.format("createLexemeFromParadigm - couldn't create lexeme %s with paradigm %d", word, paradigmID));
	}
	/**
	 * Appends the given paradigm to the lexicon.
	 * If the paradigm's ID is 0, a fresh ID (current maximum + 1) is assigned first.
	 *
	 * @param paradigm	the paradigm to add
	 */
	public void addParadigm (Paradigm paradigm) {
		if (paradigm.getID() == 0) {
			int maxnr = 0;
			for (Paradigm existing : paradigms)
				maxnr = Math.max(maxnr, existing.getID());
			paradigm.setID(maxnr + 1);
		}
		paradigms.add(paradigm);
	}
	/**
	 * Removes the given paradigm from the lexicon data.
	 *
	 * @param paradigm	the paradigm to remove
	 */
	public void removeParadigm (Paradigm paradigm) {
		paradigms.remove(paradigm);
	}
	/**
	 * Finds the largest ending ID currently present in the lexicon.
	 *
	 * @return	the largest ending ID, or 0 if there are no endings at all
	 */
    int maxEndingID() {
		int max = 0;
		for (Paradigm paradigm : paradigms)
			for (Ending ending : paradigm.endings)
				max = Math.max(max, ending.getID());
		return max;
	}
	
	/**
	 * Clears cache, if any. A no-op in this class (no analysis cache is kept here);
	 * called after lexicon modifications such as createLexeme().
	 */
	public void clearCache () {}
	/**
	 * Transfers the capitalization pattern of originalWord onto word:
	 * first-letter capitalization, ALL-CAPS, and double-surname ("Xxx-Yxx") patterns.
	 *
	 * @param word	the word (e.g. a generated lemma) to recapitalize
	 * @param originalWord	the word whose capitalization should be mirrored
	 * @return	word with its capitalization adjusted
	 */
	public static String recapitalize(String word, String originalWord) {
		if (word.isEmpty()) return word; // FIX(review): guard substring() calls against an empty word
		if (p_firstcap.matcher(originalWord).matches())
			word = word.substring(0, 1).toUpperCase() + word.substring(1);
		if (p_allcaps.matcher(originalWord).matches())
			word = word.toUpperCase();
		if (p_doublesurname.matcher(originalWord).matches()) {
			int otrslielais = word.indexOf("-") + 1;
			// FIX(review): indexOf returns -1 when '-' is absent, making otrslielais 0,
			// so the old check (> -1) always passed and the guard for "originalWord has
			// a '-' but the lemma does not" (per the original comment) never worked;
			// also guard against a trailing '-' going out of bounds
			if (otrslielais > 0 && otrslielais < word.length())
				word = word.substring(0, otrslielais) + word.substring(otrslielais, otrslielais + 1).toUpperCase() + word.substring(otrslielais + 1);
		}
		return word;
	}
}
                  © 2015 - 2025 Weber Informatics LLC | Privacy Policy