/*******************************************************************************
* Copyright 2008, 2009, 2014 Institute of Mathematics and Computer Science, University of Latvia
* Author: Pēteris Paikens
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*******************************************************************************/
package lv.semti.morphology.analyzer;
import java.io.PrintWriter;
import java.util.*;
import java.util.regex.Pattern;
import lv.semti.morphology.attributes.AttributeNames;
import lv.semti.morphology.attributes.AttributeValues;
import lv.semti.morphology.corpus.ParadigmFrequencyComparator;
import lv.semti.morphology.lexicon.*;
public class Analyzer extends Lexicon {
public boolean enablePrefixes = true;
public boolean meklētsalikteņus = false;
public boolean enableGuessing = false;
public boolean enableDiminutive = true;
public boolean enableDerivedNouns = true; // FIXME - need to figure out the correct term for this
public boolean enableVocative = false;
public boolean guessNouns = true;
public boolean guessVerbs = true;
public boolean guessParticiples = true;
public boolean guessAdjectives = true;
public boolean enableAllGuesses = false;
public boolean guessInflexibleNouns = true;
public boolean removeRareWords = true;
public boolean removeRegionalWords = true; // Ignore regional/dialect forms as they tend to produce unexpected overlap with forms of other common words
private Pattern p_number = Pattern.compile("[\\d., ]*[\\d+⁰¹²³⁴⁵⁶⁷⁸⁹₀₁₂₃₄₅₆₇₈₉]([.,][-‐‑‒–—―])?");
private Pattern p_ordinal = Pattern.compile("\\d+\\.");
private Pattern p_fractional = Pattern.compile("\\d+[\\\\/]\\d+");
private Pattern p_abbrev = Pattern.compile("\\w+\\.");
private Pattern p_abbrev_caps = Pattern.compile("\\p{Lu}+\\."); // abbreviation in all caps
private Pattern p_acronym = Pattern.compile("(\\p{Lu}){2,5}"); // all caps, repeated 2-5 times
private Pattern p_letter = Pattern.compile("(\\p{L})"); // an isolated letter
private Pattern p_url = Pattern.compile("((ht|f)tps?://)?[.\\w-]+\\.(lv|com|org|gov)(/[\\w\\d-@:?=&%.]*)?");
private Cache wordCache = new Cache();
/**
* Construct the morphological analyzer object by loading the lexicon from either the default location, a specified file name, or an input stream.
* @throws Exception
*/
public Analyzer () throws Exception {
super();
}
public Analyzer (boolean useAuxiliaryLexicons) throws Exception {
super(useAuxiliaryLexicons);
}
public Analyzer (String lexiconFileName) throws Exception {
super(lexiconFileName);
}
/**
* Loads the analyzer lexicon from the specified file
* @param lexiconFileName - main lexicon file name
* @param useAuxiliaryLexicons should secondary lexicon files be included in addition to the core lexicons
*/
public Analyzer(String lexiconFileName, boolean useAuxiliaryLexicons) throws Exception{
super(lexiconFileName, useAuxiliaryLexicons);
}
/**
* Loads the analyzer lexicon from the specified file, but excludes a blacklist of sub-lexicons when doing so
* @param lexiconFileName filename of the core lexicon
* @param blacklist list of sub-lexicon file names to skip from loading
*/
public Analyzer(String lexiconFileName, ArrayList<String> blacklist) throws Exception{
super(lexiconFileName, blacklist);
}
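/* A minimal construction-and-configuration sketch (illustration only, not part of the original
 * sources); "Lexicon.xml" is a placeholder path, assuming the lexicon format expected by the
 * Lexicon superclass:
 *
 *   Analyzer analyzer = new Analyzer("Lexicon.xml", true); // core + auxiliary lexicons
 *   analyzer.enableGuessing = true;   // allow ending-based guessing of unknown words
 *   analyzer.enableVocative = true;   // also accept vocative readings
 *   analyzer.describe(new PrintWriter(System.out)); // dump the current settings
 */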
/* TODO - compound-word guessing still needs to be written
private boolean DerSalikteņaSākumam(Ending ending) {
if (ending.getParadigm().isMatchingStrong(AttributeNames.i_PartOfSpeech,AttributeNames.v_Noun))
return ending.isMatchingStrong(AttributeNames.i_Case,AttributeNames.v_Genitive);
return false;
} */
public void defaultSettings(){
enablePrefixes = true;
meklētsalikteņus = false;
enableGuessing = false;
enableDiminutive = true;
enableDerivedNouns = true;
enableVocative = false;
guessNouns = true;
guessVerbs = true;
guessParticiples = true;
guessAdjectives = true;
guessAllParadigms = false;
enableAllGuesses = false;
guessInflexibleNouns = true;
removeRareWords = true;
removeRegionalWords = true;
}
public void describe(PrintWriter pipe) {
pipe.format("enableGuessing:\t%b\n", enableGuessing);
pipe.format("enablePrefixes:\t%b\n", enablePrefixes);
pipe.format("enableDiminutive:\t%b\n", enableDiminutive);
pipe.format("enableVocative:\t%b\n", enableVocative);
pipe.format("enableAllGuesses:\t%b\n", enableAllGuesses);
pipe.format("meklētsalikteņus:\t%b\n", meklētsalikteņus);
pipe.format("guessNouns:\t\t%b\n", guessNouns);
pipe.format("guessVerbs:\t\t%b\n", guessVerbs);
pipe.format("guessParticiples:\t%b\n", guessParticiples);
pipe.format("guessAdjectives:\t%b\n", guessAdjectives);
pipe.format("guessInflexibleNouns:\t%b\n", guessInflexibleNouns);
pipe.format("guessAllParadigms:\t%b\n", guessAllParadigms);
pipe.flush();
}
/**
* Performs a morphological analysis of an arbitrary token according to the configuration parameters set in this
* Analyzer object.
* This method handles the caching and capitalization, and delegates the actual analysis to analyzeLowercase.
* @param word - the token string to be analyzed
* @return a Word object containing the possible analysis options
*/
public Word analyze(String word) {
word = word.trim();
Word cacheWord = wordCache.get(word);
if (cacheWord != null) return (Word) cacheWord.clone();
Word rezults = new Word(word);
if (!word.equals(word.toLowerCase().trim())) {
String lettercase = AttributeNames.v_Lowercase;
if (p_firstcap.matcher(word).matches()) lettercase = AttributeNames.v_FirstUpper;
if (p_allcaps.matcher(word).matches()) lettercase = AttributeNames.v_AllUpper;
Word lowercase = analyzeLowercase(word.toLowerCase().trim(), word);
for (Wordform vārdforma : lowercase.wordforms) {
vārdforma.setToken(word.trim());
vārdforma.addAttribute(AttributeNames.i_CapitalLetters, lettercase);
rezults.addWordform(vārdforma);
}
} else {
rezults = analyzeLowercase(word, word);
}
wordCache.put(word, (Word) rezults.clone());
return rezults;
}
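/* Usage sketch for analyze() (illustrative only; the token is arbitrary, attribute names are the
 * AttributeNames constants already used above):
 *
 *   Word w = analyzer.analyze("ceļiem");
 *   if (w.isRecognized()) {
 *       for (Wordform wf : w.wordforms) {
 *           System.out.println(wf.getValue(AttributeNames.i_Lemma) + " : "
 *               + wf.getValue(AttributeNames.i_PartOfSpeech));
 *       }
 *   }
 */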
/**
* Implements the actual core morphological analysis algorithm
* @param word - the lowercase form of the word
* @param originalWord - the word with the original capitalization
* @return a Word object containing the possible analysis options
*/
private Word analyzeLowercase(String word, String originalWord) {
Word result = new Word(word);
for (Ending ending : getAllEndings().matchedEndings(word)) {
String stemBezMijas;
try {
stemBezMijas = ending.stem(word);
} catch (Ending.WrongEndingException e) {
throw new Error(e); // Shouldn't ever happen - matchedEndings should ensure that word contains that ending.
}
int stemChange = ending.getMija();
boolean properName = p_firstcap.matcher(originalWord).matches();
ArrayList<Variants> celmi = Mijas.mijuVarianti(stemBezMijas, stemChange, properName);
for (Variants celms : celmi) {
ArrayList<Lexeme> lexemes = ending.getEndingLexemes(celms.celms);
boolean foundSomethingHere = false;
if (lexemes != null)
for (Lexeme lexeme : lexemes) {
String trešāSakne = stemBezMijas;
if (lexeme.getParadigm().getStems() == 3) {
trešāSakne = lexeme.getStem(2);
}
if (!Mijas.atpakaļlocīšanasVerifikācija(celms, stemBezMijas, stemChange, trešāSakne, properName))
continue;
Wordform variants = new Wordform(word, lexeme, ending, originalWord);
variants.addAttributes(celms);
variants.addAttribute(AttributeNames.i_Guess, AttributeNames.v_NoGuess);
if (variants.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Abbreviation) && p_allcaps.matcher(originalWord).matches())
variants.addAttribute(AttributeNames.i_Lemma, variants.getValue(AttributeNames.i_Lemma).toUpperCase());
if (this.isAcceptable(variants)) { // discard variants that aren't really acceptable - singulare tantum words in plural, vocatives if they aren't wanted
result.addWordform(variants);
foundSomethingHere = true;
}
}
if (!foundSomethingHere && enableDiminutive)
guessDeminutive(word, result, ending, celms, originalWord);
if (!foundSomethingHere && enableDerivedNouns)
guessDerivedNoun(word, result, ending, celms, originalWord);
}
}
if (result.isRecognized() && (removeRareWords || removeRegionalWords)) {
boolean hasNonrareOption = false;
for (Wordform wf : result.wordforms) {
if (removeRareWords && (wf.isMatchingStrong(AttributeNames.i_Frequency, AttributeNames.v_Rare) ||
wf.isMatchingStrong(AttributeNames.i_Usage, AttributeNames.v_Outdated)))
continue; // rare and removed
if (removeRegionalWords && (wf.isMatchingStrong(AttributeNames.i_Usage, AttributeNames.v_Regional) ||
wf.isMatchingStrong(AttributeNames.i_Usage, AttributeNames.v_RegionalOutdated)))
continue; // regional and removed
hasNonrareOption = true;
}
List<Wordform> to_remove = new LinkedList<>();
for (Wordform wf : result.wordforms) {
if (removeRareWords && hasNonrareOption && (
wf.isMatchingStrong(AttributeNames.i_Frequency, AttributeNames.v_Rare) ||
wf.isMatchingStrong(AttributeNames.i_Usage, AttributeNames.v_Outdated)
))
to_remove.add(wf); // we remove rare words only if there's a non-rare option remaining, i.e. only in overlap
if (removeRegionalWords && (wf.isMatchingStrong(AttributeNames.i_Usage, AttributeNames.v_Regional) ||
wf.isMatchingStrong(AttributeNames.i_Usage, AttributeNames.v_RegionalOutdated)))
to_remove.add(wf); // we remove regional words whenever the flag is set, even without overlap
}
result.wordforms.removeAll(to_remove);
}
if (!result.isRecognized()) { // Hardcoded exceptions (regex-based) that recognize digits, ordinal numbers, etc.
Ending HARDCODED_ENDING = this.paradigmByName("hardcoded").getLemmaEnding();
if (HARDCODED_ENDING == null) {
System.err.println("Hardcoded ending not found");
return result;
}
if (p_number.matcher(word).matches()) {
Wordform wf = new Wordform(word, word, HARDCODED_ENDING, AttributeNames.v_Residual);
wf.addAttribute(AttributeNames.i_ResidualType, AttributeNames.v_Number);
result.addWordform(wf);
return result;
}
if (p_fractional.matcher(word).matches()) {
Wordform wf = new Wordform(word, word, HARDCODED_ENDING, AttributeNames.v_Residual);
wf.addAttribute(AttributeNames.i_ResidualType, AttributeNames.v_Number);
result.addWordform(wf);
return result;
}
if (p_ordinal.matcher(word).matches()) {
Wordform wf = new Wordform(word, word, HARDCODED_ENDING, AttributeNames.v_Residual);
wf.addAttribute(AttributeNames.i_ResidualType, AttributeNames.v_Ordinal);
result.addWordform(wf);
return result;
}
if (p_abbrev_caps.matcher(originalWord).matches() ||
p_abbrev.matcher(word).matches() ||
(enableGuessing && p_acronym.matcher(originalWord).matches())
) {
Ending ABBREV_ENDING = this.paradigmByName("abbr").getLemmaEnding();
if (ABBREV_ENDING == null) {
System.err.println("Abbreviation ending not found");
return result;
}
result.addWordform(new Wordform(word, word, ABBREV_ENDING, AttributeNames.v_Abbreviation));
return result;
}
if (p_letter.matcher(word).matches()) {
Wordform wf = new Wordform(word, word, HARDCODED_ENDING, AttributeNames.v_Residual);
result.addWordform(wf);
return result;
}
if (p_url.matcher(word).matches()) {
Wordform wf = new Wordform(word, word, HARDCODED_ENDING, AttributeNames.v_Residual);
wf.addAttribute(AttributeNames.i_ResidualType, AttributeNames.v_URI);
result.addWordform(wf);
return result;
}
}
if (enablePrefixes) {
if (!result.isRecognized()
|| (word.startsWith(this.NEGATION_PREFIX) && !result.hasAttribute(AttributeNames.i_PartOfSpeech, AttributeNames.v_Verb))) {
for (Wordform wf : guessByPrefix(word).wordforms) {
result.addWordform(wf);
}
}
// if (result.hasAttribute(AttributeNames.i_PartOfSpeech, AttributeNames.v_Verb))
}
/*
if (!rezultāts.isRecognized() && meklētsalikteņus )
for (Ending ending : allEndings())
if (DerSalikteņaSākumam(ending)) {
for (ArrayList pirmiecelmi : galotne.getVārdgrupa().leksēmaspēcVārda.get(galotne.saknesNr-1).values()) {
//FIXME - compound-word lookup is not implemented here
}
} */
if (!result.isRecognized() && enableGuessing )
result = guessByEnding(word, originalWord);
if (enableGuessing) {
boolean all_deminutives = true;
// We want to do full guessing also if there was a deminutive found - otherwise masc sg gen "Rāviņa" gets interpreted as deminutive of "rāva"
for (Wordform wf : result.wordforms) {
if (!wf.isMatchingStrong(AttributeNames.i_Guess, AttributeNames.v_Deminutive))
all_deminutives = false;
}
if (!result.isRecognized() || all_deminutives) result = guessByEnding(word, originalWord);
}
/*for (Wordform variants : rezultāts.wordforms) {
variants.addAttribute(AttributeNames.i_Tag, MarkupConverter.toKamolsMarkup(variants));
if (variants.lexeme != null) {
String locījumuDemo = "";
for (Wordform locījums : generateInflectionsFromParadigm(variants.lexeme)) {
locījumuDemo = locījumuDemo + locījums.getValue(AttributeNames.i_Word) + " " + locījums.getValue(AttributeNames.i_Case) + "\n";
}
variants.pieliktĪpašību("LocījumuDemo", locījumuDemo);
//TODO - something should probably be done with all of this, but without a specific need it is a performance hog
}
} */
return result;
}
private void guessDerivedNoun(String word, Word result, Ending ending, Variants celms, String originalWord) {
// -tājs, -ējs, -tāja, -ēja
if (!ending.getParadigm().isMatchingStrong(AttributeNames.i_ParadigmSupportedDerivations, AttributeNames.v_Derivation_tājs_tāja_ējs_ēja))
return;
if (celms.celms.endsWith("tāj")) {
String verb_stem = celms.celms.substring(0,celms.celms.length()-3);
for (int paradigmID : new int[]{16, 17, 45}) {
Paradigm p = this.paradigmByID(paradigmID);
ArrayList<Lexeme> lexemes = p.getLexemesByStem().get(0).get(verb_stem);
if (lexemes != null) {
for (Lexeme lexeme : lexemes) {
Wordform variants = new Wordform(word, lexeme, ending);
variants.addAttributes(celms); // TODO - this might be redundant
variants.addAttribute(AttributeNames.i_Source, "-tājs/-tāja sufiksāls atvasinājums");
variants.addAttribute(AttributeNames.i_SourceLemma, lexeme.getValue(AttributeNames.i_Lemma));
variants.addAttribute(AttributeNames.i_Guess, AttributeNames.v_Deminutive);
String lemma = lexeme.getStem(0) + "tāj" + ending.getLemmaEnding().getEnding();
lemma = recapitalize(lemma, originalWord);
variants.addAttribute(AttributeNames.i_Lemma, lemma);
result.addWordform(variants);
}
}
}
} else if (celms.celms.endsWith("ēj")) {
Paradigm p = this.paradigmByID(15); // verb-1
ArrayList<Variants> verb_stems = Mijas.mijuVarianti(celms.celms.substring(0,celms.celms.length()-2), 14, false); // 1st conjugation -is form stem change - I believe this is the correct one here
for (Variants verb_stem : verb_stems) {
ArrayList<Lexeme> lexemes = p.getLexemesByStem().get(2).get(verb_stem.celms);
if (lexemes != null) {
for (Lexeme lexeme : lexemes) {
Wordform variants = new Wordform(word, lexeme, ending);
variants.addAttributes(verb_stem); // ?
variants.addAttribute(AttributeNames.i_Source, "-ējs/-ēja sufiksāls atvasinājums");
variants.addAttribute(AttributeNames.i_SourceLemma, lexeme.getValue(AttributeNames.i_Lemma));
variants.addAttribute(AttributeNames.i_Guess, AttributeNames.v_Deminutive);
String lemma = verb_stem.celms + "ēj" + ending.getLemmaEnding().getEnding();
lemma = recapitalize(lemma, originalWord);
variants.addAttribute(AttributeNames.i_Lemma, lemma);
result.addWordform(variants);
}
}
}
}
}
/**
* Attempts to verify whether this word can be derived as a possible diminutive form of some noun in the lexicon
* @param word
* @param rezultāts
* @param ending
* @param celms
* @param originalWord
*/
private void guessDeminutive(String word, Word rezultāts, Ending ending,
Variants celms, String originalWord) {
if (celms.celms.endsWith("īt") &&
ending.getParadigm().isMatchingStrong(AttributeNames.i_ParadigmSupportedDerivations, AttributeNames.v_Diminutive_īt)) {
ArrayList<Lexeme> deminutīvleksēmas = ending.getEndingLexemes(celms.celms.substring(0,celms.celms.length()-2));
if (deminutīvleksēmas != null)
for (Lexeme leksēma : deminutīvleksēmas) {
Wordform variants = new Wordform(word, leksēma, ending);
variants.addAttributes(celms); // TODO - this might be redundant
variants.addAttribute(AttributeNames.i_Deminutive, "-īt-");
variants.addAttribute(AttributeNames.i_Source,"pamazināmo formu atvasināšana");
variants.addAttribute(AttributeNames.i_SourceLemma, leksēma.getValue(AttributeNames.i_Lemma));
variants.addAttribute(AttributeNames.i_Guess, AttributeNames.v_Deminutive);
String lemma = leksēma.getStem(0) + "īt" + ending.getLemmaEnding().getEnding();
lemma = recapitalize(lemma, originalWord);
variants.addAttribute(AttributeNames.i_Lemma, lemma);
rezultāts.addWordform(variants);
}
}
if (celms.celms.endsWith("iņ") &&
ending.getParadigm().isMatchingStrong(AttributeNames.i_ParadigmSupportedDerivations, AttributeNames.v_Diminutive_iņ)) {
String pamatforma = celms.celms.substring(0,celms.celms.length()-2);
String pamatforma2 = pamatforma;
if (pamatforma.endsWith("dz")) pamatforma2 = pamatforma.substring(0,pamatforma.length()-2)+"g";
if (pamatforma.endsWith("c")) pamatforma2 = pamatforma.substring(0,pamatforma.length()-1)+"k";
ArrayList<Lexeme> deminutīvleksēmas = ending.getEndingLexemes(pamatforma2);
if (ending.getParadigm().getName().equalsIgnoreCase("noun-1b")) { // the declension changes (galds -> galdiņš), so this is handled specially
deminutīvleksēmas = this.paradigmByName("noun-1a").getLemmaEnding().getEndingLexemes(pamatforma2);
if (pamatforma.endsWith("l")) pamatforma2 = pamatforma.substring(0,pamatforma.length()-1)+"ļ";
ArrayList<Lexeme> deminutīvleksēmas2 = ending.getEndingLexemes(pamatforma2);
// but there is also ceļš -> celiņš, where the 2nd declension remains
if (deminutīvleksēmas == null) deminutīvleksēmas = deminutīvleksēmas2;
else if (deminutīvleksēmas2 != null) deminutīvleksēmas.addAll(deminutīvleksēmas2);
}
if ((pamatforma.endsWith("ļ") && ending.getParadigm().getName().equalsIgnoreCase("noun-1b")) || pamatforma.endsWith("k") || pamatforma.endsWith("g"))
deminutīvleksēmas = null; // disallow the incorrect stem changes 'ceļiņš', 'pīrāgiņš', 'druskiņa'
if (deminutīvleksēmas != null)
for (Lexeme leksēma : deminutīvleksēmas) {
Wordform variants = new Wordform(word, leksēma, ending);
variants.addAttributes(celms); // ?
variants.addAttribute(AttributeNames.i_Deminutive, "-iņ-");
variants.addAttribute(AttributeNames.i_Source,"pamazināmo formu atvasināšana");
variants.addAttribute(AttributeNames.i_SourceLemma, leksēma.getValue(AttributeNames.i_Lemma));
variants.addAttribute(AttributeNames.i_Guess, AttributeNames.v_Deminutive);
String lemma = pamatforma + "iņ" + ending.getLemmaEnding().getEnding();
lemma = recapitalize(lemma, originalWord);
variants.addAttribute(AttributeNames.i_Lemma, lemma);
rezultāts.addWordform(variants);
}
}
}
private boolean isAcceptable(Wordform variants) {
if (!enableVocative && variants.isMatchingStrong(AttributeNames.i_Case,AttributeNames.v_Vocative))
return false;
if (variants.isMatchingStrong(AttributeNames.i_NumberSpecial, AttributeNames.v_PlurareTantum) &&
!(variants.isMatchingWeak(AttributeNames.i_Number, AttributeNames.v_Plural) || variants.isMatchingWeak(AttributeNames.i_Number, AttributeNames.v_NA)))
return false;
if (variants.isMatchingStrong(AttributeNames.i_NumberSpecial, AttributeNames.v_SingulareTantum) &&
!(variants.isMatchingWeak(AttributeNames.i_Number, AttributeNames.v_Singular) || variants.isMatchingWeak(AttributeNames.i_Number, AttributeNames.v_NA)))
return false;
return true;
}
private Word guessByPrefix(String word) {
Word rezultāts = new Word(word);
if (word.contains(" ")) return rezultāts;
boolean vajadzība = false;
if (word.startsWith(this.DEBITIVE_PREFIX)) {
vajadzība = true;
word = word.substring(2);
}
for (String priedēklis : prefixes)
if (word.startsWith(priedēklis) || word.startsWith(this.SUPERLATIVE_PREFIX+priedēklis)) {
String cut_word;
if (word.startsWith(this.SUPERLATIVE_PREFIX)) {
cut_word = this.SUPERLATIVE_PREFIX+word.substring(3+priedēklis.length());
} else {
cut_word = word.substring(priedēklis.length());
}
if (vajadzība) cut_word = this.DEBITIVE_PREFIX + cut_word;
Word bezpriedēkļa = analyzeLowercase(cut_word, cut_word);
for (Wordform variants : bezpriedēkļa.wordforms)
if (variants.getEnding() != null && variants.getEnding().getParadigm() != null && variants.getEnding().getParadigm().getValue(AttributeNames.i_Konjugaacija) != null) { // Only classes derived from verbs
if (priedēklis.equals(this.NEGATION_PREFIX) && (variants.isMatchingStrong(AttributeNames.i_Mood, AttributeNames.v_DebitiveQuotative)
|| variants.isMatchingStrong(AttributeNames.i_Mood, AttributeNames.v_Debitive))
|| variants.isMatchingStrong(AttributeNames.i_Noliegums, AttributeNames.v_Yes) ) {
continue; // do not generate ne- derivations for the debitive mood or for already-negated entries
}
if (variants.isMatchingStrong(AttributeNames.i_Degree, AttributeNames.v_Superlative) && !word.startsWith(this.SUPERLATIVE_PREFIX) ) {
continue; // neņemam tos, kur ir "vis" uzlicies aiz priedēkļa, kā nevisdomājošākais pavisdomājošākais
}
variants.setToken(word);
variants.addAttribute(AttributeNames.i_Source,"priedēkļu atvasināšana");
variants.addAttribute(AttributeNames.i_Prefix, priedēklis);
if (!priedēklis.equals(this.NEGATION_PREFIX)) {
variants.addAttribute(AttributeNames.i_SourceLemma, variants.getValue(AttributeNames.i_Lemma));
variants.addAttribute(AttributeNames.i_Lemma,priedēklis+variants.getValue(AttributeNames.i_Lemma));
}
variants.addAttribute(AttributeNames.i_Guess, AttributeNames.v_Prefix);
variants.addAttribute(AttributeNames.i_Noliegums, priedēklis.equals(this.NEGATION_PREFIX) ? AttributeNames.v_Yes : AttributeNames.v_No);
rezultāts.wordforms.add(variants);
}
}
return rezultāts;
}
public void reanalyze(Word vārds) {
Word jaunais = analyze(vārds.getToken());
vārds.wordforms.clear();
for (Wordform vārdforma : jaunais.wordforms)
vārds.wordforms.add(vārdforma);
vārds.notifyObservers();
}
// originalWord - original capitalization
public Word guessByEnding(String word, String originalWord) {
Word rezultāts = new Word(word);
for (int i=word.length()-2; i>=0; i--) { // TODO - dumb heuristic that ranks purely by ending length; should be based on statistics
for (Ending ending : getAllEndings().matchedEndings(word))
if (ending.getEnding().length()==i) {
Paradigm p = ending.getParadigm();
if (p.isMatchingStrong(AttributeNames.i_ParadigmProperties, AttributeNames.v_HardcodedWordforms))
continue; // The hardcoded paradigm is not relevant for guessing
String stem;
try {
stem = ending.stem(word);
} catch (Ending.WrongEndingException e) {
throw new Error(e); // Shouldn't ever happen - matchedEndings should ensure that word contains that ending.
}
ArrayList<Variants> celmi = Mijas.mijuVarianti(stem, ending.getMija(), false); //FIXME - this could also be a proper name... but how would that information get here?
for (Variants celma_variants : celmi) {
String celms = celma_variants.celms;
if (!p.allowedGuess(celms))
if (p_firstcap.matcher(originalWord).matches() && (p.getName().equalsIgnoreCase("noun-4m") ||
p.getName().equalsIgnoreCase("noun-4ma") || p.getName().equalsIgnoreCase("noun-3f"))) {
} // If it's a proper noun with an -a/-e ending, also try masculine-gender variants for surnames
else
continue; // in other cases, if the final letters don't fit what the lexicon specifies for this paradigm, don't guess.
Wordform variants = new Wordform(word, null, ending);
variants.addAttribute(AttributeNames.i_Source, "minējums pēc galotnes");
variants.addAttribute(AttributeNames.i_Guess, AttributeNames.v_Ending);
// FIXME what about the other variants?? and what if there are no variants?
Ending pamatforma = ending.getLemmaEnding();
if (pamatforma != null) {
// Work out a correct lemma
String lemma = celms + pamatforma.getEnding();
lemma = recapitalize(lemma, originalWord);
variants.addAttribute(AttributeNames.i_Lemma, lemma);
}
if (((this.guessNouns && ending.getParadigm().isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Noun) &&
(enableVocative || !variants.isMatchingStrong(AttributeNames.i_Case, AttributeNames.v_Vocative)) &&
(guessInflexibleNouns || !variants.isMatchingStrong(AttributeNames.i_Declension, AttributeNames.v_NA))
) ||
(this.guessVerbs && ending.getParadigm().isMatchingWeak(AttributeNames.i_PartOfSpeech, AttributeNames.v_Verb)) ||
(this.guessAdjectives && ending.getParadigm().isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective)) ||
(this.guessParticiples && variants.isMatchingStrong(AttributeNames.i_Mood, AttributeNames.v_Participle)) ||
(this.guessNouns && this.guessInflexibleNouns && variants.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Residual))
)
&& (i > 0 || variants.isMatchingStrong(AttributeNames.i_Declension, AttributeNames.v_NA)
|| variants.isMatchingStrong(AttributeNames.i_Declension, AttributeNames.v_InflexibleGenitive)
|| variants.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Residual)
)) // if there is no ending, it is either an indeclinable noun or unrecognized; this keeps endingless verb forms, which are usually wrong, out of the guesses
{
if (variants.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Residual)) {
char last = celms.charAt(celms.length() - 1);
if (Character.isDigit(last)) {
variants.removeAttribute(AttributeNames.i_ResidualType); // the default AttributeNames.v_Foreign does not fit here
}
}
rezultāts.wordforms.add(variants);
}
}
}
if (rezultāts.isRecognized() && !enableAllGuesses) {
// FIXME - this should be replaced with a heuristic that finds, for example, the most plausible noun variants, the most plausible adjective cases, etc.
if (!word.endsWith("o")) // mēdz būt nelokāmi -o lietvārdi - bez galotnes, pretstatā dažām -o formām
break;
}
}
return rezultāts;
}
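/* Sketch of direct ending-based guessing for an out-of-vocabulary token (illustration only;
 * normally this path is reached through analyze() when enableGuessing is set; the surname
 * below is a hypothetical example):
 *
 *   Word guesses = analyzer.guessByEnding("zemlikovs", "Zemlikovs");
 *   for (Wordform wf : guesses.wordforms) {
 *       System.out.println(wf.getValue(AttributeNames.i_Lemma) + " "
 *           + wf.getValue(AttributeNames.i_Guess));
 *   }
 */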
/**
* Performs morphological analysis, assuming that we know explicitly that the form is a lemma
* @param word
* @return
*/
public Word analyzeLemma(String word) {
Word result = new Word(word);
Word varianti = analyze(word);
for (Wordform vārdforma : varianti.wordforms) {
Ending ending = vārdforma.getEnding();
if ( (ending != null && ending.getLemmaEnding() == ending) ||
(vārdforma.getValue(AttributeNames.i_Lemma).equalsIgnoreCase(word) && (
vārdforma.isMatchingStrong(AttributeNames.i_NumberSpecial, AttributeNames.v_PlurareTantum) ||
vārdforma.isMatchingStrong(AttributeNames.i_EntryProperties, AttributeNames.v_Plural) ) ) )
result.addWordform(vārdforma);
}
return result;
}
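/* Usage sketch for analyzeLemma() (illustrative): unlike analyze(), only readings whose ending is
 * the paradigm's lemma ending (or a plural-lemma / plurale tantum entry) are kept:
 *
 *   Word lemmaReadings = analyzer.analyzeLemma("galds");
 *   // an inflected form such as "galdu" would come back with no wordforms
 */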
/**
* Provides a list of paradigms that might be suitable for a given lemma
* The guessing restrictions for stem-final letters and closed paradigms are obeyed, except for the exceptions listed in the lexicon
* @param lemma the lemma that should be reviewed. Plural forms will be treated as possibly valid for the case of plurare tantum
* @return a list of Paradigm objects which are possible for this case.
*/
public List<Paradigm> suitableParadigms(String lemma) {
List<Paradigm> result = new ArrayList<>();
Word lexicon_options = this.analyze(lemma);
Word all_options = this.guessByEnding(lemma.toLowerCase().trim(), lemma); // All analysis options as a starting point
for (Wordform wf : lexicon_options.wordforms) {
all_options.addWordform(wf); // form a joint list of both known words from lexicon and also pure guessing
}
AttributeValues pluraretantum = new AttributeValues();
pluraretantum.addAttribute(AttributeNames.i_PartOfSpeech, AttributeNames.v_Noun);
pluraretantum.addAttribute(AttributeNames.i_Case, AttributeNames.v_Nominative);
pluraretantum.addAttribute(AttributeNames.i_Number, AttributeNames.v_Plural);
for (Wordform option : all_options.wordforms) {
Ending ending = option.getEnding();
if ((ending != null && ending.getLemmaEnding() == ending) || option.isMatchingWeak(pluraretantum)) {
result.add(ending.getParadigm());
}
}
// sort list according to statistical frequency, and remove duplicates
Set<Paradigm> result_set = new TreeSet<>(new Comparator<Paradigm>() { //comparator to eliminate duplicates
@Override
public int compare(Paradigm a, Paradigm b) {
return a.getID() - b.getID();
}
});
result_set.addAll(result);
result = new ArrayList<>(result_set);
Collections.sort(result, new ParadigmFrequencyComparator()); //comparator for statistical frequency
Collections.reverse(result); // We want the list in order of descending frequency
return result;
}
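/* Usage sketch for suitableParadigms() (illustrative; the lemma is an arbitrary example):
 *
 *   List<Paradigm> options = analyzer.suitableParadigms("Raudive");
 *   for (Paradigm p : options) {
 *       System.out.println(p.getID() + " " + p.getName()); // most frequent paradigms first
 *   }
 */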
public void setCacheSize (int maxSize) {
wordCache.setSize(maxSize);
}
public void clearCache () {
wordCache.clear();
}
public ArrayList<Wordform> generateInflections(String lemma) {
return generateInflections(lemma, false);
}
public ArrayList<Wordform> generateInflections(String lemma, boolean nouns_only) {
return generateInflections(lemma, nouns_only, new AttributeValues());
}
public ArrayList<Wordform> generateInflections(String lemma, boolean nouns_only, AttributeValues filter) {
// First, check the special case of double (hyphenated) surnames
if (p_doublesurname.matcher(lemma).matches()) {
int hyphen = lemma.indexOf("-");
AttributeValues part_filter = new AttributeValues(filter); // relax filter conditions for the first part, as it can have different endings than the whole compound surname
part_filter.removeAttribute(AttributeNames.i_Lemma);
ArrayList<Wordform> inflections2 = generateInflections(lemma.substring(hyphen+1, lemma.length()), nouns_only, part_filter);
part_filter.removeAttribute(AttributeNames.i_Declension);
part_filter.removeAttribute(AttributeNames.i_ParadigmID);
if (part_filter.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Residual)) {
part_filter.removeAttribute(AttributeNames.i_PartOfSpeech);
part_filter.removeAttribute(AttributeNames.i_ResidualType);
}
ArrayList<Wordform> inflections1 = generateInflections(lemma.substring(0, hyphen), nouns_only, part_filter);
if ( (inflections1.size()>1 && inflections2.size()>1)) // If either part turns out to be indeclinable, don't treat this as a double surname - those are very rare and it would break more than it would gain
return mergeInflections(inflections1, inflections2, "-"); // TODO - the unit tests contain commented-out examples Pavļuta-Deslandes and Freiberga-Žverelo that would exercise this
}
Word possibilities = this.analyze(lemma);
filterInflectionPossibilities(nouns_only, filter, possibilities.wordforms);
ArrayList<Wordform> result = generateInflections_TryLemmas(lemma, possibilities);
if (result != null) filterInflectionPossibilities(nouns_only, filter, result);
// If result is null, it means that the suggested lemma could be (and was) generated from another lemma - e.g. "Dīcis" from "dīkt" - but not from an existing lexicon lemma
// We assume that a true lemma was passed by the caller, and we need to generate/guess the wordforms as if the lemma was correct.
if ((result == null || result.size()==0) && this.enableGuessing) {
possibilities = this.guessByEnding(lemma.toLowerCase(), lemma);
filterInflectionPossibilities(nouns_only, filter, possibilities.wordforms);
result = generateInflections_TryLemmas(lemma, possibilities);
}
// If guessing didn't work, return an empty list
if (result == null)
result = new ArrayList<>();
return result;
}
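/* Usage sketch for the generateInflections() overloads (illustrative): the filter keeps only
 * forms whose attributes weakly match, e.g. restricting to one number:
 *
 *   AttributeValues onlyPlural = new AttributeValues();
 *   onlyPlural.addAttribute(AttributeNames.i_Number, AttributeNames.v_Plural);
 *   ArrayList<Wordform> forms = analyzer.generateInflections("galds", true, onlyPlural);
 */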
// Take two sets of inflections and merge them into one... currently used only for double surnames, maybe later elsewhere too (phrases?)
private ArrayList<Wordform> mergeInflections(
ArrayList<Wordform> inflections1, ArrayList<Wordform> inflections2,
String concatenator) {
ArrayList<Wordform> result = new ArrayList<>();
if (inflections1.size() <= 1) {
// Special case - the first part is indeclinable
String fixedtoken = "???";
String fixedlemma = "???";
if (inflections1.size() > 0) {
fixedtoken = inflections1.get(0).getToken();
fixedlemma = inflections1.get(0).getValue(AttributeNames.i_Lemma);
}
for (Wordform otrā : inflections2) {
Wordform apvienojums = (Wordform) otrā.clone(); // Base info from the second part, as it is nominally the grammatically dominant one
apvienojums.setToken(fixedtoken + concatenator + apvienojums.getToken());
apvienojums.addAttribute(AttributeNames.i_Lemma, fixedlemma + concatenator + apvienojums.getValue(AttributeNames.i_Lemma));
// TODO - anything else?
result.add(apvienojums);
}
} else if (inflections2.size() <= 1) {
// Special case - the second part is indeclinable
String fixedtoken = "???";
String fixedlemma = "???";
if (inflections2.size() > 0) {
fixedtoken = inflections2.get(0).getToken();
fixedlemma = inflections2.get(0).getValue(AttributeNames.i_Lemma);
}
for (Wordform pirmā : inflections1) {
Wordform apvienojums = (Wordform) pirmā.clone(); // Base info from the first part here, since the second part is indeclinable
apvienojums.setToken(apvienojums.getToken() + concatenator + fixedtoken);
apvienojums.addAttribute(AttributeNames.i_Lemma, apvienojums.getValue(AttributeNames.i_Lemma) + concatenator + fixedlemma);
// TODO - anything else?
result.add(apvienojums);
}
} else {
// The normal case, where the two sets need to be merged carefully
for (Wordform pirmā : inflections1) {
AttributeValues filter = new AttributeValues();
// Assume that only nouns will come through here
filter.addAttribute(AttributeNames.i_Case, pirmā.getValue(AttributeNames.i_Case));
filter.addAttribute(AttributeNames.i_Number, pirmā.getValue(AttributeNames.i_Number));
ArrayList<Wordform> possibilities = (ArrayList<Wordform>) inflections2.clone();
filterInflectionPossibilities(true, filter, possibilities);
if (possibilities.size() == 0) {
// Debuginfo
// System.err.println("Problēma ar dubultuzvārdu locīšanu - nesanāca dabūt atbilstošu 'pārīti' šim te pirmās daļas locījumam");
// pirmā.describe(new PrintWriter(System.err));
// System.err.println(".. no šīs te kopas otrās daļas locījumu");
// for (Wordform otrā : inflections2) {
// otrā.describe(new PrintWriter(System.err));
// System.err.println(" --");
// }
} else {
if ((!pirmā.isMatchingStrong(AttributeNames.i_Case, AttributeNames.v_Vocative) && possibilities.size() > 1) || possibilities.size() > 2) {
// Debuginfo
// System.err.println("Problēma ar dubultuzvārdu locīšanu - par daudz atbilstošu 'pārīšu' šim te pirmās daļas locījumam");
// pirmā.describe(new PrintWriter(System.err));
// System.err.println(".. no šīs te kopas otrās daļas locījumu");
// for (Wordform otrā : inflections2) {
// otrā.describe(new PrintWriter(System.err));
// System.err.println(" --");
// }
}
Wordform apvienojums = (Wordform) possibilities.get(0).clone(); // Base info from the second part, as it is nominally the grammatically dominant one
apvienojums.setToken(pirmā.getToken() + concatenator + apvienojums.getToken());
apvienojums.addAttribute(AttributeNames.i_Lemma, pirmā.getValue(AttributeNames.i_Lemma) + concatenator + apvienojums.getValue(AttributeNames.i_Lemma));
// TODO - anything else?
result.add(apvienojums);
}
}
}
return result;
}
// generate all forms if the paradigm # is known
// TODO - needs more support for extra features (fixed-genitives, etc)
public ArrayList<Wordform> generateInflectionsFromParadigm(String lemma, int paradigm, AttributeValues lemmaAttributes) {
Paradigm p = this.paradigmByID(paradigm);
if (p == null)
return generateInflections(lemma); // If the supplied paradigm is invalid, we ignore it
if (p.getStems() > 1) // For 1st conjugation verbs, lemma is not enough info to inflect properly
return generateInflections(lemma); // Assume that it will be in current lexicon..
Ending ending = p.getLemmaEnding(); // We expect that the lemma will be the default lemma, unless...
// if attributes list plurare tantum, then we look for plural nominative as the lemma
if ((lemmaAttributes.isMatchingStrong(AttributeNames.i_NumberSpecial, AttributeNames.v_PlurareTantum) ||
lemmaAttributes.isMatchingStrong(AttributeNames.i_EntryProperties, AttributeNames.v_Plural))
&& !ending.isMatchingWeak(AttributeNames.i_Number, AttributeNames.v_Plural)) {
// Assuming that there will be only one plural nominative entry in case of daudzskaitlinieki
AttributeValues plural_nominative = new AttributeValues();
plural_nominative.addAttribute(AttributeNames.i_Number, AttributeNames.v_Plural);
plural_nominative.addAttribute(AttributeNames.i_Case, AttributeNames.v_Nominative);
if (ending.getParadigm().isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective)) {
plural_nominative.addAttribute(AttributeNames.i_Definiteness, AttributeNames.v_Indefinite);
plural_nominative.addAttribute(AttributeNames.i_Gender, AttributeNames.v_Masculine);
}
for (Ending candidate_ending : ending.getParadigm().endings) {
if (candidate_ending.isMatchingStrongOneSide(plural_nominative)
&& lemma.endsWith(candidate_ending.getEnding())) {
ending = candidate_ending;
}
}
}
// if attributes list feminine gender, then we look for feminine singular nominative as the lemma
if (lemmaAttributes.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Feminine)
&& !ending.isMatchingWeak(AttributeNames.i_Gender, AttributeNames.v_Feminine)) {
// Assuming that there will be only one fitting form
AttributeValues feminine_lemma = new AttributeValues();
feminine_lemma.addAttribute(AttributeNames.i_Number, AttributeNames.v_Singular);
feminine_lemma.addAttribute(AttributeNames.i_Case, AttributeNames.v_Nominative);
feminine_lemma.addAttribute(AttributeNames.i_Gender, AttributeNames.v_Feminine);
if (ending.getParadigm().isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective)) {
feminine_lemma.addAttribute(AttributeNames.i_Definiteness, AttributeNames.v_Indefinite);
}
for (Ending candidate_ending : ending.getParadigm().endings) {
if (candidate_ending.isMatchingStrong(feminine_lemma)
&& lemma.endsWith(candidate_ending.getEnding())) {
ending = candidate_ending;
}
}
}
// did not find an appropriate ending
if (ending == null || !lemma.endsWith(ending.getEnding())) {
System.err.printf("Attempted to generate inflections for lemma '%s' at paradigm '%d'; failed because of mismatched ending\n", lemma, paradigm);
}
if (ending == null) return new ArrayList<>();
Lexeme l = this.createLexeme(lemma, ending, "temp");
if (l == null) { // Couldn't create the lexeme - the word wasn't compatible with the supplied paradigm
return new ArrayList<>();
}
l.addAttributes(lemmaAttributes);
ArrayList<Wordform> result = generateInflections(l, lemma);
filterInflectionPossibilities(false, null, result);
p.removeLexeme(l); // To not pollute the in-memory lexicon - FIXME - this temporary lexeme does have temporary pollution which could have multithreading race conditions
return result;
}
// generate all forms if the paradigm # is known
public ArrayList<Wordform> generateInflectionsFromParadigm(String lemma, int paradigm) {
return generateInflectionsFromParadigm(lemma, paradigm, new AttributeValues());
}
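/* Usage sketch for paradigm-constrained generation (illustrative; the paradigm ID 7 is a
 * placeholder - real IDs depend on the loaded lexicon):
 *
 *   ArrayList<Wordform> forms = analyzer.generateInflectionsFromParadigm("vectēvs", 7);
 *   // falls back to generateInflections(lemma) if the ID is unknown or extra stems are needed
 */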
public ArrayList<Wordform> generateInflectionsFromParadigm(String lemma, int paradigm, String stem1, String stem2, String stem3){
return generateInflectionsFromParadigm(lemma, paradigm, stem1, stem2, stem3, new AttributeValues());
}
// generate all forms if the paradigm # and also the three lemmas (for 1st conjugation) are known
// FIXME - DRY, repeats previous function
public ArrayList<Wordform> generateInflectionsFromParadigm(String lemma, int paradigm, String stem1, String stem2, String stem3, AttributeValues lemmaAttributes) {
Paradigm p = this.paradigmByID(paradigm);
if (p == null)
return generateInflections(lemma); // If the supplied paradigm is invalid, we ignore it
// if (p.getStems() == 1) // If it's not 1st conjugation verb, perform as if we didn't know the stems
// return generateInflectionsFromParadigm(lemma, paradigm, lemmaAttributes);
if (!lemma.endsWith(p.getLemmaEnding().getEnding())) {
//FIXME - should check for plural nouns, etc
}
Ending e = p.getLemmaEnding();
String normallemma = stem1 + e.getEnding();
Lexeme l = this.createLexeme(normallemma, e, "temp");
l.addAttribute(AttributeNames.i_Lemma, lemma);
l.addAttributes(lemmaAttributes);
if (l == null) { // Couldn't create the lexeme - the word wasn't compatible with the supplied paradigm
return new ArrayList();
}
l.setStem(0, stem1);
if (p.getStems()>1) {
l.setStem(1, stem2);
l.setStem(2, stem3);
}
ArrayList<Wordform> result = generateInflections(l, lemma);
filterInflectionPossibilities(false, null, result);
p.removeLexeme(l); // To not pollute the in-memory lexicon
return result;
}
// removes possibilities that aren't nouns/substantivised adjectives, and don't match the filter
public void filterInflectionPossibilities(boolean nouns_only, AttributeValues filter, ArrayList<Wordform> possibilities) {
ArrayList<Wordform> unsuitable = new ArrayList<>();
for (Wordform wf : possibilities) {
// "nouns_only" filter and its exceptions
boolean suitable = ! nouns_only; // if nouns_only, then we want to test for partofspeech, if not, then okay by default
if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Noun)) suitable = true;
if (wf.isMatchingStrong(AttributeNames.i_Conversion, AttributeNames.v_Noun)) suitable = true;
if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective) &&
wf.isMatchingStrong(AttributeNames.i_Definiteness, AttributeNames.v_Definite)) suitable = true;
if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Residual) &&
wf.isMatchingStrong(AttributeNames.i_ResidualType, AttributeNames.v_Foreign)) suitable = true; // various foreign names like Vadim, Kirill, etc.
// ------ end of nouns_only exceptions
/* Now implemented with flag 'Morfotabulas attēlošana'
if (wf.isMatchingStrong(AttributeNames.i_ProperNounType, AttributeNames.v_Toponym) &&
wf.isMatchingStrong(AttributeNames.i_Number, AttributeNames.v_Plural) &&
!wf.isMatchingStrong(AttributeNames.i_NumberSpecial, AttributeNames.v_PlurareTantum)
) suitable = false; // Do not generate plural forms of singular toponyms
*/
if (wf.isMatchingStrong(AttributeNames.i_EntryProperties, AttributeNames.v_EntryComparative) &&
wf.isMatchingStrong(AttributeNames.i_Degree, AttributeNames.v_Positive)
) suitable = false; // Do not generate positive forms of comparative/superlative adjectives
if (!wf.isMatchingWeak(filter) &&
!wf.isMatchingStrong(AttributeNames.i_ResidualType, AttributeNames.v_Foreign) &&
!wf.isMatchingStrong(AttributeNames.i_Declension, AttributeNames.v_NA)
) suitable = false; //filter overrides everything except inflexible stuff
if (!suitable) unsuitable.add(wf);
}
possibilities.removeAll(unsuitable);
}
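/* Sketch of post-filtering generated forms (illustrative): filterInflectionPossibilities mutates
 * the passed list in place, so filter a copy if the full list is still needed:
 *
 *   ArrayList<Wordform> forms = analyzer.generateInflections("liepa");
 *   AttributeValues genitiveOnly = new AttributeValues();
 *   genitiveOnly.addAttribute(AttributeNames.i_Case, AttributeNames.v_Genitive);
 *   analyzer.filterInflectionPossibilities(true, genitiveOnly, forms);
 */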
// TODO - needs refactoring and unittests
// Attempts to find the "proper lemma" out of analysis options provided, possibly making a new lexeme if needed, and then generate the inflections from that lemma
public ArrayList<Wordform> generateInflections_TryLemmas(String lemma, Word w) {
for (Wordform wf : w.wordforms) {
if (wf.isMatchingStrong(AttributeNames.i_Case, AttributeNames.v_Vocative))
continue; // Vocatives often match lemmas and are false positives
// Try each of the analysis variants to see whether it is a base form (i.e. matches the required lemma)
Lexeme lex = wf.lexeme;
// The regular case where lemmas must be "normal"
if (wf.getValue(AttributeNames.i_Lemma).equalsIgnoreCase(lemma) ||
lemma.equalsIgnoreCase(wf.getValue(AttributeNames.i_LemmaParadigm)) ) {
if (lex == null || !lex.getValue(AttributeNames.i_Lemma).equalsIgnoreCase(lemma)) { // NB! this is lex.lemma not wf.lemma that's checked earlier
// If this isn't the right lexeme (derivation or guessing), create a lexeme
Ending ending = wf.getEnding();
if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adverb))
ending = this.paradigmByName("adverb").getLemmaEnding();
// FIXME - I hardcoded an exception case here because I can't figure out a cleaner general approach
if (lemma.endsWith("šana") && wf.getEnding().getParadigm().isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Verb)) {
ending = this.paradigmByName("noun-4f").getLemmaEnding();
}
lex = this.createLexeme(lemma, ending, "generateInflectionsFromParadigm"); // Temporary lexeme
if (lex.getValue(AttributeNames.i_PartOfSpeech) == null)
lex.addAttribute(AttributeNames.i_PartOfSpeech, wf.getValue(AttributeNames.i_PartOfSpeech)); // so that the hardcoded word class gets a POS - abbreviations etc.
if (p_firstcap.matcher(lemma).matches())
lex.addAttribute(AttributeNames.i_NounType, AttributeNames.v_ProperNoun); //FIXME - hack for inflecting personal names like 'Valdis'
if (wf.getEnding().getParadigm().getStems() > 1 && wf.lexeme != null && wf.getValue(AttributeNames.i_Prefix) != null) { // For prefix derivation, the prefix must also be added to the other stems
lex.setStem(1, wf.getValue(AttributeNames.i_Prefix) + wf.lexeme.getStem(1));
lex.setStem(2, wf.getValue(AttributeNames.i_Prefix) + wf.lexeme.getStem(2));
}
}
ArrayList<Wordform> result = generateInflections(lex, lemma);
if (lex.isMatchingStrong(AttributeNames.i_Source, "generateInflectionsFromParadigm"))
lex.getParadigm().removeLexeme(lex); // remove the temporary lexeme
return result;
}
if (lemma.startsWith(this.NEGATION_PREFIX) && lemma.equalsIgnoreCase(this.NEGATION_PREFIX + wf.getValue(AttributeNames.i_Lemma)) && lex != null) {
// inflection of negated verbs/participles
return generateInflections(lex, lemma);
}
// The case for nominalized adjectives such as adjective-derived surnames
if ( wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective) && (
(lemma.toLowerCase().endsWith("ais") && lemma.equalsIgnoreCase(wf.getValue(AttributeNames.i_Lemma).substring(0, wf.getValue(AttributeNames.i_Lemma).length()-1)+"ais")) ||
(lemma.toLowerCase().endsWith("ā") && wf.getValue(AttributeNames.i_Lemma).equalsIgnoreCase(lemma.substring(0, lemma.length()-1)+"s") && wf.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Feminine)) ) ) {
// Exception for adjective-based surnames "Lielais", "Platais" etc
if ((lex == null && lemma.toLowerCase().endsWith("ais")) || (lex != null && !lex.getValue(AttributeNames.i_Lemma).equalsIgnoreCase(lemma))) {
lex = this.createLexeme(lemma, wf.getEnding(), "generateInflectionsFromParadigm");
if (p_firstcap.matcher(lemma).matches())
lex.addAttribute(AttributeNames.i_NounType, AttributeNames.v_ProperNoun); //FIXME - hack for inflecting personal names like 'Valdis'
}
if (lex == null) continue;
ArrayList<Wordform> result = new ArrayList<>();
for (Wordform wf2 : generateInflections(lex, lemma)) {
if (wf2.isMatchingStrong(AttributeNames.i_Definiteness, AttributeNames.v_Definite) && wf2.isMatchingStrong(AttributeNames.i_Degree, AttributeNames.v_Positive) && wf2.isMatchingWeak(AttributeNames.i_Gender, wf.getValue(AttributeNames.i_Gender))) {
result.add(wf2);
}
}
if (lex.isMatchingStrong(AttributeNames.i_Source, "generateInflectionsFromParadigm"))
lex.getParadigm().removeLexeme(lex);
return result;
}
}
return null;
}
public ArrayList<Wordform> generateInflections(Lexeme lexeme, String lemma)
{
String trešāSakne = null, vārds;
//Vārds rezultāts = new Vārds(leksēma.īpašības.Īpašība(IpasibuNosaukumi.i_Pamatforma));
ArrayList<Wordform> inflections = new ArrayList<>(1);
// for 1st conjugation future-tense stem changes we need to know the 3rd stem
if (lexeme.getParadigm().getStems() == 3) {
trešāSakne = lexeme.getStem(2);
}
boolean noliegums = lemma.equalsIgnoreCase(this.NEGATION_PREFIX+lexeme.getValue(AttributeNames.i_Lemma));
for (Ending ending : lexeme.getParadigm().endings){
if ( ending.getValue(AttributeNames.i_PartOfSpeech)==null ||
ending.getValue(AttributeNames.i_PartOfSpeech).equals(lexeme.getValue(AttributeNames.i_PartOfSpeech)) ||
lexeme.getValue(AttributeNames.i_PartOfSpeech) == null) {
boolean vispārākāPak = ending.isMatchingStrong(AttributeNames.i_Definiteness, AttributeNames.v_Definite) ||
ending.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adverb);
boolean properName = lexeme.isMatchingStrong(AttributeNames.i_NounType, AttributeNames.v_ProperNoun);
ArrayList<Variants> celmi = Mijas.MijasLocīšanai(lexeme.getStem(ending.stemID-1), ending.getMija(), trešāSakne, vispārākāPak, properName);
for (Variants celms : celmi){
vārds = celms.celms + ending.getEnding();
if (noliegums) {
if (vārds.startsWith(this.SUPERLATIVE_PREFIX) && celms.isMatchingStrong(AttributeNames.i_Degree, AttributeNames.v_Superlative)) {
vārds = this.SUPERLATIVE_PREFIX + this.NEGATION_PREFIX + vārds.substring(this.SUPERLATIVE_PREFIX.length());
} else vārds = this.NEGATION_PREFIX + vārds;
}
vārds = recapitalize(vārds, lemma);
Wordform locījums = new Wordform(vārds, lexeme, ending);
locījums.addAttributes(celms);
boolean validOption = locījums.isMatchingWeak(AttributeNames.i_Generate, AttributeNames.v_Yes);
if (locījums.isMatchingStrong(AttributeNames.i_NumberSpecial, AttributeNames.v_PlurareTantum) && locījums.isMatchingStrong(AttributeNames.i_Number, AttributeNames.v_Singular)) validOption = false;
if (locījums.isMatchingStrong(AttributeNames.i_NumberSpecial, AttributeNames.v_SingulareTantum) && locījums.isMatchingStrong(AttributeNames.i_Number, AttributeNames.v_Plural)) validOption = false;
if (GenerationBlacklist.blacklist(locījums)) validOption = false;
if (noliegums) locījums.addAttribute(AttributeNames.i_Noliegums, AttributeNames.v_Yes);
if ((locījums.isMatchingStrong(AttributeNames.i_Noliegums, AttributeNames.v_Yes) ||
lexeme.getStem(0).equalsIgnoreCase("vajadzē")) &&
(locījums.isMatchingStrong(AttributeNames.i_Mood, AttributeNames.v_DebitiveQuotative)
|| locījums.isMatchingStrong(AttributeNames.i_Mood, AttributeNames.v_Debitive))) validOption = false;
// Adjectives with feminine gender but masculine endings - ālava / ālavs, likewise archaic numeral forms like 'tūkstošām'
if ((locījums.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective) ||
locījums.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Numeral) )&&
lexeme.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Feminine) &&
ending.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Masculine)) validOption = false;
if (validOption) inflections.add(locījums);
}
}
}
if (lexeme.getParadigm().isMatchingStrong(AttributeNames.i_ParadigmProperties, AttributeNames.v_OnlyHardcodedWordforms)) {
inflections = new ArrayList<>(1); // In this case we don't add the regular ending-generated wordforms at all
}
// Check whether this lemma has a hardcoded form override (for example, viņš *ej -> viņš iet)
Collection<Lexeme> hc_forms = this.hardcodedForms.get(lemma);
if (hc_forms.isEmpty() && lemma.startsWith(this.NEGATION_PREFIX) && (lemma.endsWith("t") || lemma.endsWith("ties"))) {
hc_forms = this.hardcodedForms.get(lemma.substring(2));
}
for (Lexeme formLexeme : hc_forms) {
Ending ending = formLexeme.getParadigm().getLemmaEnding();
Wordform hardcoded = new Wordform(formLexeme.getStem(0), formLexeme, ending);
if (!hardcoded.isMatchingWeak(AttributeNames.i_Generate, AttributeNames.v_Yes))
continue;
if (!lexeme.getParadigm().isMatchingWeak(AttributeNames.i_PartOfSpeech, hardcoded.getValue(AttributeNames.i_PartOfSpeech)))
continue;
if (hardcoded.isMatchingStrong(AttributeNames.i_Noliegums, AttributeNames.v_Yes) && !lemma.startsWith(this.NEGATION_PREFIX))
continue;
if (hardcoded.isMatchingStrong(AttributeNames.i_Noliegums, AttributeNames.v_No) && lemma.startsWith(this.NEGATION_PREFIX))
continue;
if (!hardcoded.isMatchingStrong(AttributeNames.i_ExtraForm, AttributeNames.v_Yes)) {
Wordform override = null;
for (Wordform form : inflections) { // check whether any of the existing inflections should be dropped because this hardcoded variant replaces it
if (form.isMatchingWeak(formLexeme)) {
override = form;
}
}
if (override != null) {
inflections.remove(override);
}
}
inflections.add(hardcoded);
}
// For verbs, generate also negated forms
if (!noliegums && lexeme.getParadigm().isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Verb) && !lexeme.isMatchingStrong(AttributeNames.i_Noliegums, AttributeNames.v_Yes)) {
ArrayList<Wordform> negated_inflections = generateInflections(lexeme,this.NEGATION_PREFIX+lexeme.getValue(AttributeNames.i_Lemma));
inflections.addAll(negated_inflections);
}
return inflections;
}
}