lv.semti.morphology.analyzer.Analyzer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of morphology Show documentation
Show all versions of morphology Show documentation
Latvian morphological analysis library
/*******************************************************************************
* Copyright 2008, 2009, 2014 Institute of Mathematics and Computer Science, University of Latvia
* Author: Pēteris Paikens
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*******************************************************************************/
package lv.semti.morphology.analyzer;
import java.io.PrintWriter;
import java.util.*;
import java.util.regex.Pattern;
import lv.semti.morphology.attributes.AttributeNames;
import lv.semti.morphology.attributes.AttributeValues;
import lv.semti.morphology.corpus.ParadigmFrequencyComparator;
import lv.semti.morphology.lexicon.*;
public class Analyzer extends Lexicon {
/** Allows analyzeLowercase to derive prefixed forms through guessByPrefix. */
public boolean enablePrefixes = true;
/** Compound-word search switch; in this part of the file it is referenced only from commented-out code. */
public boolean meklētsalikteņus = false;
/** Allows unknown words to be guessed from their endings (guessByEnding). */
public boolean enableGuessing = false;
/** Tries the -īt-/-iņ- diminutive derivation when a stem variant has no direct lexicon match. */
public boolean enableDiminutive = true;
/** Tries the -tājs/-tāja/-ējs/-ēja derivation when a stem variant has no direct lexicon match. FIXME - the correct linguistic term for this still needs to be settled. */
public boolean enableDerivedNouns = true;
/** When false, vocative readings are rejected by isAcceptable and by ending-based guessing of nouns. */
public boolean enableVocative = false;
/** Ending-based guessing may propose noun readings. */
public boolean guessNouns = true;
/** Ending-based guessing may propose verb readings. */
public boolean guessVerbs = true;
/** Ending-based guessing may propose participle readings. */
public boolean guessParticiples = true;
/** Ending-based guessing may propose adjective readings. */
public boolean guessAdjectives = true;
/** When false, ending-based guessing stops at the first ending length that produced a reading (except for words ending in "o"). */
public boolean enableAllGuesses = false;
/** Ending-based guessing may propose indeclinable nouns and residuals. */
public boolean guessInflexibleNouns = true;
/** Drop rare/outdated readings when the same token also has a reading that is neither rare nor regional. */
public boolean removeRareWords = true;
/** Ignore regional/dialect forms as they tend to produce unexpected overlap with forms of other common words. */
public boolean removeRegionalWords = true;

// The patterns are immutable and safe to share, so they are compiled once per class instead of once per Analyzer.
/** Number: digits with separators, ending in a digit, '+' or a super/subscript digit, optionally followed by a separator and a dash. */
private static final Pattern p_number = Pattern.compile("[\\d., ]*[\\d+⁰¹²³⁴⁵⁶⁷⁸⁹₀₁₂₃₄₅₆₇₈₉]([.,][-‐‑‒–—―])?");
/** Ordinal number: digits followed by a dot. */
private static final Pattern p_ordinal = Pattern.compile("\\d+\\.");
/** Fraction: digits, a slash or backslash, digits. */
private static final Pattern p_fractional = Pattern.compile("\\d+[\\\\/]\\d+");
/** Abbreviation: word characters followed by a dot. */
private static final Pattern p_abbrev = Pattern.compile("\\w+\\.");
/** Abbreviation in all caps followed by a dot. */
private static final Pattern p_abbrev_caps = Pattern.compile("\\p{Lu}+\\.");
/** Acronym: 2-5 uppercase letters. */
private static final Pattern p_acronym = Pattern.compile("(\\p{Lu}){2,5}");
/** A single isolated letter. */
private static final Pattern p_letter = Pattern.compile("(\\p{L})");
/** URL-like token with an optional http(s)/ftp(s) scheme and a .lv/.com/.org/.gov domain. */
private static final Pattern p_url = Pattern.compile("((ht|f)tps?://)?[.\\w-]+\\.(lv|com|org|gov)(/[\\w\\d-@:?=&%.]*)?");

/** Cache of analysis results keyed by the trimmed token (see analyze). */
private Cache wordCache = new Cache();
/**
 * Constructs the morphological analyzer by delegating to the no-argument {@code Lexicon} constructor.
 * NOTE(review): the earlier comment here stated that the lexicon is loaded from a default location;
 * that happens in the superclass, which is not visible in this part of the file - confirm.
 * @throws Exception whatever the superclass constructor throws
 */
public Analyzer () throws Exception {
super();
}
/**
 * Constructs the analyzer by delegating to the {@code Lexicon(boolean)} constructor.
 * @param useAuxiliaryLexicons passed through unchanged; presumably selects whether secondary lexicon files are loaded - confirm against Lexicon
 * @throws Exception whatever the superclass constructor throws
 */
public Analyzer (boolean useAuxiliaryLexicons) throws Exception {
super(useAuxiliaryLexicons);
}
/**
 * Constructs the analyzer by delegating to the {@code Lexicon(String)} constructor.
 * @param lexiconFileName lexicon file name, passed through unchanged
 * @throws Exception whatever the superclass constructor throws
 */
public Analyzer (String lexiconFileName) throws Exception {
super(lexiconFileName);
}
/**
 * Loads the analyzer lexicon from the specified file by delegating to the matching {@code Lexicon} constructor.
 * @param lexiconFileName main lexicon file name
 * @param useAuxiliaryLexicons whether secondary lexicon files should be included in addition to the core lexicons
 * @throws Exception whatever the superclass constructor throws
 */
public Analyzer(String lexiconFileName, boolean useAuxiliaryLexicons) throws Exception{
super(lexiconFileName, useAuxiliaryLexicons);
}
/**
 * Loads the analyzer lexicon from the specified file, but excludes a blacklist of sub-lexicons when doing so.
 * Delegates to the matching {@code Lexicon} constructor.
 * @param lexiconFileName file name of the core lexicon
 * @param blacklist list of sub-lexicon file names to skip when loading (presumably Strings - the element type is not declared here; confirm)
 * @throws Exception whatever the superclass constructor throws
 */
public Analyzer(String lexiconFileName, ArrayList blacklist) throws Exception{
super(lexiconFileName, blacklist);
}
/* TODO - compound-word guessing still has to be implemented
private boolean DerSalikteņaSākumam(Ending ending) {
if (ending.getParadigm().isMatchingStrong(AttributeNames.i_PartOfSpeech,AttributeNames.v_Noun))
return ending.isMatchingStrong(AttributeNames.i_Case,AttributeNames.v_Genitive);
return false;
} */
/**
 * Sets the analyzer configuration flags to fixed values.
 * For the flags declared in this class, the values match the field initializers.
 */
public void defaultSettings(){
    // Features that are switched off.
    meklētsalikteņus = false;
    enableGuessing = false;
    enableVocative = false;
    guessAllParadigms = false;
    enableAllGuesses = false;

    // Derivation features that are switched on.
    enablePrefixes = true;
    enableDiminutive = true;
    enableDerivedNouns = true;

    // Part-of-speech filters for ending-based guessing.
    guessNouns = true;
    guessVerbs = true;
    guessParticiples = true;
    guessAdjectives = true;
    guessInflexibleNouns = true;

    // Removal of rare and regional readings.
    removeRareWords = true;
    removeRegionalWords = true;
}
/**
 * Prints the current configuration flags to the given writer, one tab-separated
 * "name: value" line per flag, and then flushes the writer.
 * @param pipe writer that receives the settings dump
 */
public void describe(PrintWriter pipe) {
    pipe.format("enableGuessing:\t%b\n", enableGuessing);
    pipe.format("enablePrefixes:\t%b\n", enablePrefixes);
    pipe.format("enableDiminutive:\t%b\n", enableDiminutive);
    pipe.format("enableVocative:\t%b\n", enableVocative);
    pipe.format("enableAllGuesses:\t%b\n", enableAllGuesses);
    pipe.format("meklētsalikteņus:\t%b\n", meklētsalikteņus);
    pipe.format("guessNouns:\t\t%b\n", guessNouns);
    pipe.format("guessVerbs:\t\t%b\n", guessVerbs);
    pipe.format("guessParticiples:\t%b\n", guessParticiples);
    pipe.format("guessAdjectives:\t%b\n", guessAdjectives);
    pipe.format("guessInflexibleNouns:\t%b\n", guessInflexibleNouns);
    pipe.format("guessAllParadigms:\t%b\n", guessAllParadigms);
    // These flags are reset by defaultSettings() but were missing from the dump;
    // they are appended at the end so the existing lines keep their order.
    pipe.format("enableDerivedNouns:\t%b\n", enableDerivedNouns);
    pipe.format("removeRareWords:\t%b\n", removeRareWords);
    pipe.format("removeRegionalWords:\t%b\n", removeRegionalWords);
    pipe.flush();
}
/**
 * Performs a morphological analysis of an arbitrary token according to the configuration parameters set in this
 * Analyzer object.
 * This method handles the caching and capitalization, and delegates the actual analysis to analyzeLowercase.
 * Both the cached copy and the returned object are separate clones, so callers may modify the result freely.
 * @param word - the token string to be analyzed; surrounding whitespace is trimmed
 * @return a Word object containing the possible analysis options
 */
public Word analyze(String word) {
    word = word.trim();
    Word cacheWord = wordCache.get(word);
    if (cacheWord != null) return (Word) cacheWord.clone();

    // Locale.ROOT keeps the case mapping independent of the JVM default locale
    // (e.g. under a Turkish locale "I" would otherwise become a dotless "ı").
    String lowered = word.toLowerCase(Locale.ROOT);

    Word rezults;
    if (word.equals(lowered)) {
        rezults = analyzeLowercase(word, word);
    } else {
        rezults = new Word(word);
        // Classify the capitalization; the all-caps check runs last, so it wins when both patterns match.
        String lettercase = AttributeNames.v_Lowercase;
        if (p_firstcap.matcher(word).matches()) lettercase = AttributeNames.v_FirstUpper;
        if (p_allcaps.matcher(word).matches()) lettercase = AttributeNames.v_AllUpper;

        Word lowercase = analyzeLowercase(lowered, word);
        for (Wordform vārdforma : lowercase.wordforms) {
            // Restore the original token and record how it was capitalized.
            vārdforma.setToken(word);
            vārdforma.addAttribute(AttributeNames.i_CapitalLetters, lettercase);
            rezults.addWordform(vārdforma);
        }
    }

    wordCache.put(word, (Word) rezults.clone());
    return rezults;
}
/**
 * Implements the actual core morphological analysis algorithm.
 * Steps, in order:
 * <ol>
 * <li>look up every ending that matches the word and every stem-change variant of the remaining stem in the
 *     lexicon; where a stem variant has no accepted lexicon reading, try diminutive / derived-noun derivation;</li>
 * <li>optionally drop rare, outdated and regional readings;</li>
 * <li>if nothing was recognized, try the hardcoded regex classes (numbers, ordinals, abbreviations, letters, URLs);</li>
 * <li>optionally add prefix-derived readings;</li>
 * <li>optionally fall back to guessing by ending.</li>
 * </ol>
 * @param word - the lowercase form of the word
 * @param originalWord - the word with the original capitalization
 * @return a Word object containing the possible analysis options
 */
private Word analyzeLowercase(String word, String originalWord) {
    Word result = new Word(word);
    // Depends only on originalWord, so it is computed once rather than once per matched ending.
    boolean properName = p_firstcap.matcher(originalWord).matches();

    for (Ending ending : getAllEndings().matchedEndings(word)) {
        String stemBezMijas;
        try {
            stemBezMijas = ending.stem(word);
        } catch (Ending.WrongEndingException e) {
            throw new Error(e); // Shouldn't ever happen - matchedEndings should ensure that word contains that ending.
        }

        int stemChange = ending.getMija();
        ArrayList<Variants> celmi = Mijas.mijuVarianti(stemBezMijas, stemChange, properName);

        for (Variants celms : celmi) {
            ArrayList<Lexeme> lexemes = ending.getEndingLexemes(celms.celms);
            boolean foundSomethingHere = false;
            if (lexemes != null) {
                for (Lexeme lexeme : lexemes) {
                    // For three-stem paradigms the verification uses the lexeme's third stem.
                    String trešāSakne = stemBezMijas;
                    if (lexeme.getParadigm().getStems() == 3) {
                        trešāSakne = lexeme.getStem(2);
                    }
                    if (!Mijas.atpakaļlocīšanasVerifikācija(celms, stemBezMijas, stemChange, trešāSakne, properName))
                        continue;

                    Wordform variants = new Wordform(word, lexeme, ending, originalWord);
                    variants.addAttributes(celms);
                    variants.addAttribute(AttributeNames.i_Guess, AttributeNames.v_NoGuess);
                    // All-caps abbreviations keep an upper-case lemma; Locale.ROOT avoids locale-specific case mapping.
                    if (variants.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Abbreviation) && p_allcaps.matcher(originalWord).matches())
                        variants.addAttribute(AttributeNames.i_Lemma, variants.getValue(AttributeNames.i_Lemma).toUpperCase(Locale.ROOT));

                    // Discard readings that are not really admissible - singularia tantum in plural, vocatives when not wanted.
                    if (this.isAcceptable(variants)) {
                        result.addWordform(variants);
                        foundSomethingHere = true;
                    }
                }
            }

            if (!foundSomethingHere && enableDiminutive)
                guessDeminutive(word, result, ending, celms, originalWord);
            if (!foundSomethingHere && enableDerivedNouns)
                guessDerivedNoun(word, result, ending, celms, originalWord);
        }
    }

    if (result.isRecognized() && (removeRareWords || removeRegionalWords)) {
        // First pass: is there at least one reading that would survive both filters?
        boolean hasNonrareOption = false;
        for (Wordform wf : result.wordforms) {
            if (removeRareWords && (wf.isMatchingStrong(AttributeNames.i_Frequency, AttributeNames.v_Rare) ||
                    wf.isMatchingStrong(AttributeNames.i_Usage, AttributeNames.v_Outdated)))
                continue; // rare and removed
            if (removeRegionalWords && (wf.isMatchingStrong(AttributeNames.i_Usage, AttributeNames.v_Regional) ||
                    wf.isMatchingStrong(AttributeNames.i_Usage, AttributeNames.v_RegionalOutdated)))
                continue; // regional and removed
            hasNonrareOption = true;
        }

        // Second pass: collect the readings to drop.
        List<Wordform> to_remove = new LinkedList<>();
        for (Wordform wf : result.wordforms) {
            if (removeRareWords && hasNonrareOption && (
                    wf.isMatchingStrong(AttributeNames.i_Frequency, AttributeNames.v_Rare) ||
                    wf.isMatchingStrong(AttributeNames.i_Usage, AttributeNames.v_Outdated)
                    ))
                to_remove.add(wf); // we remove rare words only if there's a non-rare option remaining, i.e. only in overlap
            if (removeRegionalWords && (wf.isMatchingStrong(AttributeNames.i_Usage, AttributeNames.v_Regional) ||
                    wf.isMatchingStrong(AttributeNames.i_Usage, AttributeNames.v_RegionalOutdated)))
                to_remove.add(wf); // we remove regional words whenever the flag is set, even without overlap
        }
        result.wordforms.removeAll(to_remove);
    }

    if (!result.isRecognized()) { // Hardcoded exceptions (regex based) that recognize digits, ordinal numbers etc.
        // A missing paradigm is handled the same way as a missing lemma ending instead of failing with an NPE.
        Paradigm hardcodedParadigm = this.paradigmByName("hardcoded");
        Ending HARDCODED_ENDING = (hardcodedParadigm == null) ? null : hardcodedParadigm.getLemmaEnding();
        if (HARDCODED_ENDING == null) {
            System.err.println("Hardcoded ending not found");
            return result;
        }

        if (p_number.matcher(word).matches()) {
            Wordform wf = new Wordform(word, word, HARDCODED_ENDING, AttributeNames.v_Residual);
            wf.addAttribute(AttributeNames.i_ResidualType, AttributeNames.v_Number);
            result.addWordform(wf);
            return result;
        }
        if (p_fractional.matcher(word).matches()) {
            Wordform wf = new Wordform(word, word, HARDCODED_ENDING, AttributeNames.v_Residual);
            wf.addAttribute(AttributeNames.i_ResidualType, AttributeNames.v_Number);
            result.addWordform(wf);
            return result;
        }
        if (p_ordinal.matcher(word).matches()) {
            Wordform wf = new Wordform(word, word, HARDCODED_ENDING, AttributeNames.v_Residual);
            wf.addAttribute(AttributeNames.i_ResidualType, AttributeNames.v_Ordinal);
            result.addWordform(wf);
            return result;
        }
        if (p_abbrev_caps.matcher(originalWord).matches() ||
                p_abbrev.matcher(word).matches() ||
                (enableGuessing && p_acronym.matcher(originalWord).matches())
                ) {
            Paradigm abbrevParadigm = this.paradigmByName("abbr");
            Ending ABBREV_ENDING = (abbrevParadigm == null) ? null : abbrevParadigm.getLemmaEnding();
            if (ABBREV_ENDING == null) {
                System.err.println("Abbreviation ending not found");
                return result;
            }
            result.addWordform(new Wordform(word, word, ABBREV_ENDING, AttributeNames.v_Abbreviation));
            return result;
        }
        if (p_letter.matcher(word).matches()) {
            Wordform wf = new Wordform(word, word, HARDCODED_ENDING, AttributeNames.v_Residual);
            result.addWordform(wf);
            return result;
        }
        if (p_url.matcher(word).matches()) {
            Wordform wf = new Wordform(word, word, HARDCODED_ENDING, AttributeNames.v_Residual);
            wf.addAttribute(AttributeNames.i_ResidualType, AttributeNames.v_URI);
            result.addWordform(wf);
            return result;
        }
    }

    if (enablePrefixes) {
        // Prefix derivation runs for unrecognized words, and also for words starting with the negation
        // prefix that have no verb reading so far.
        if (!result.isRecognized()
                || (word.startsWith(this.NEGATION_PREFIX) && !result.hasAttribute(AttributeNames.i_PartOfSpeech, AttributeNames.v_Verb))) {
            for (Wordform wf : guessByPrefix(word).wordforms) {
                result.addWordform(wf);
            }
        }
    }

    // TODO - compound-word search (meklētsalikteņus) is not implemented here.

    if (enableGuessing) {
        // We want to do full guessing also if there was a deminutive found - otherwise masc sg gen "Rāviņa" gets interpreted as deminutive of "rāva"
        boolean all_deminutives = true;
        for (Wordform wf : result.wordforms) {
            if (!wf.isMatchingStrong(AttributeNames.i_Guess, AttributeNames.v_Deminutive))
                all_deminutives = false;
        }
        // This single call also covers the plain "nothing recognized" case. A separate earlier call used to
        // run the guesser a second time whenever the first attempt produced nothing.
        if (!result.isRecognized() || all_deminutives) result = guessByEnding(word, originalWord);
    }

    // TODO - tagging every wordform with its full inflection table was tried here, but it is a performance hog
    // and is not done without a specific need.
    return result;
}
/**
 * Attempts to interpret the word as a noun derived from a lexicon verb with the suffix
 * -tājs / -tāja / -ējs / -ēja, and adds every such reading to {@code result}.
 * Does nothing unless the ending's paradigm declares support for this derivation.
 * The readings are tagged with the same i_Guess value as diminutives (v_Deminutive).
 * @param word the lowercase word being analyzed
 * @param result the Word that receives any derived readings
 * @param ending the ending that was matched on the word
 * @param celms the stem variant left after removing the ending
 * @param originalWord the word with its original capitalization, used to recapitalize the lemma
 */
private void guessDerivedNoun(String word, Word result, Ending ending, Variants celms, String originalWord) {
    // -tājs, -ējs, -tāja, -ēja
    if (!ending.getParadigm().isMatchingStrong(AttributeNames.i_ParadigmSupportedDerivations, AttributeNames.v_Derivation_tājs_tāja_ējs_ēja))
        return;

    if (celms.celms.endsWith("tāj")) {
        String verb_stem = celms.celms.substring(0,celms.celms.length()-3);
        // The verb is looked up by its first stem in paradigms 16, 17 and 45.
        for (int paradigmID : new int[]{16, 17, 45}) {
            Paradigm p = this.paradigmByID(paradigmID);
            ArrayList<Lexeme> lexemes = p.getLexemesByStem().get(0).get(verb_stem);
            if (lexemes != null) {
                for (Lexeme lexeme : lexemes) {
                    Wordform variants = new Wordform(word, lexeme, ending);
                    variants.addAttributes(celms); // TODO - this may be redundant
                    variants.addAttribute(AttributeNames.i_Source, "-tājs/-tāja sufiksāls atvasinājums");
                    variants.addAttribute(AttributeNames.i_SourceLemma, lexeme.getValue(AttributeNames.i_Lemma));
                    variants.addAttribute(AttributeNames.i_Guess, AttributeNames.v_Deminutive);
                    String lemma = lexeme.getStem(0) + "tāj" + ending.getLemmaEnding().getEnding();
                    lemma = recapitalize(lemma, originalWord);
                    variants.addAttribute(AttributeNames.i_Lemma, lemma);
                    result.addWordform(variants);
                }
            }
        }
    } else if (celms.celms.endsWith("ēj")) {
        Paradigm p = this.paradigmByID(15); // verb-1
        // Stem change 14 is the one for the 1st conjugation -is form; the original author believed it is the right one here.
        ArrayList<Variants> verb_stems = Mijas.mijuVarianti(celms.celms.substring(0,celms.celms.length()-2), 14,false);
        for (Variants verb_stem : verb_stems) {
            // Here the verb is looked up by its third stem (index 2).
            ArrayList<Lexeme> lexemes = p.getLexemesByStem().get(2).get(verb_stem.celms);
            if (lexemes != null) {
                for (Lexeme lexeme : lexemes) {
                    Wordform variants = new Wordform(word, lexeme, ending);
                    variants.addAttributes(verb_stem); // ?
                    variants.addAttribute(AttributeNames.i_Source, "-ējs/-ēja sufiksāls atvasinājums");
                    variants.addAttribute(AttributeNames.i_SourceLemma, lexeme.getValue(AttributeNames.i_Lemma));
                    variants.addAttribute(AttributeNames.i_Guess, AttributeNames.v_Deminutive);
                    String lemma = verb_stem.celms + "ēj" + ending.getLemmaEnding().getEnding();
                    lemma = recapitalize(lemma, originalWord);
                    variants.addAttribute(AttributeNames.i_Lemma, lemma);
                    result.addWordform(variants);
                }
            }
        }
    }
}
/**
 * Attempts to verify if this word can be derived as a possible deminutive form (-īt- or -iņ-) of some noun
 * in the lexicon, and adds every such reading to {@code rezultāts}.
 * @param word the lowercase word being analyzed
 * @param rezultāts the Word that receives any deminutive readings
 * @param ending the ending that was matched on the word
 * @param celms the stem variant left after removing the ending
 * @param originalWord the word with its original capitalization, used to recapitalize the lemma
 */
private void guessDeminutive(String word, Word rezultāts, Ending ending,
        Variants celms, String originalWord) {
    if (celms.celms.endsWith("īt") &&
            ending.getParadigm().isMatchingStrong(AttributeNames.i_ParadigmSupportedDerivations, AttributeNames.v_Diminutive_īt)) {
        ArrayList<Lexeme> deminutīvleksēmas = ending.getEndingLexemes(celms.celms.substring(0,celms.celms.length()-2));
        if (deminutīvleksēmas != null) {
            for (Lexeme leksēma : deminutīvleksēmas) {
                Wordform variants = new Wordform(word, leksēma, ending);
                variants.addAttributes(celms); // TODO - this may be redundant
                variants.addAttribute(AttributeNames.i_Deminutive, "-īt-");
                variants.addAttribute(AttributeNames.i_Source,"pamazināmo formu atvasināšana");
                variants.addAttribute(AttributeNames.i_SourceLemma, leksēma.getValue(AttributeNames.i_Lemma));
                variants.addAttribute(AttributeNames.i_Guess, AttributeNames.v_Deminutive);
                String lemma = leksēma.getStem(0) + "īt" + ending.getLemmaEnding().getEnding();
                lemma = recapitalize(lemma, originalWord);
                variants.addAttribute(AttributeNames.i_Lemma, lemma);
                rezultāts.addWordform(variants);
            }
        }
    }

    if (celms.celms.endsWith("iņ") &&
            ending.getParadigm().isMatchingStrong(AttributeNames.i_ParadigmSupportedDerivations, AttributeNames.v_Diminutive_iņ)) {
        String pamatforma = celms.celms.substring(0,celms.celms.length()-2);
        // Undo the consonant change in front of -iņ-: dz -> g, c -> k.
        String pamatforma2 = pamatforma;
        if (pamatforma.endsWith("dz")) pamatforma2 = pamatforma.substring(0,pamatforma.length()-2)+"g";
        if (pamatforma.endsWith("c")) pamatforma2 = pamatforma.substring(0,pamatforma.length()-1)+"k";

        ArrayList<Lexeme> deminutīvleksēmas = ending.getEndingLexemes(pamatforma2);
        if (ending.getParadigm().getName().equalsIgnoreCase("noun-1b")) { // the declension changes (galds -> galdiņš), hence the special handling
            deminutīvleksēmas = this.paradigmByName("noun-1a").getLemmaEnding().getEndingLexemes(pamatforma2);
            if (pamatforma.endsWith("l")) pamatforma2 = pamatforma.substring(0,pamatforma.length()-1)+"ļ";
            ArrayList<Lexeme> deminutīvleksēmas2 = ending.getEndingLexemes(pamatforma2);
            // but there is also ceļš->celiņš, which stays in the 2nd declension
            if (deminutīvleksēmas == null) {
                deminutīvleksēmas = deminutīvleksēmas2;
            } else if (deminutīvleksēmas2 != null) {
                // Merge into a fresh list: the list returned by getEndingLexemes may be the lexicon's own
                // index entry (not visible here), and appending to it in place would alter the lexicon
                // on every analysis.
                ArrayList<Lexeme> merged = new ArrayList<>(deminutīvleksēmas);
                merged.addAll(deminutīvleksēmas2);
                deminutīvleksēmas = merged;
            }
        }

        if ((pamatforma.endsWith("ļ") && ending.getParadigm().getName().equalsIgnoreCase("noun-1b")) || pamatforma.endsWith("k") || pamatforma.endsWith("g"))
            deminutīvleksēmas = null; // do not allow the incorrect stem changes 'ceļiņš', 'pīrāgiņš', 'druskiņa'

        if (deminutīvleksēmas != null) {
            for (Lexeme leksēma : deminutīvleksēmas) {
                Wordform variants = new Wordform(word, leksēma, ending);
                variants.addAttributes(celms); // ?
                variants.addAttribute(AttributeNames.i_Deminutive, "-iņ-");
                variants.addAttribute(AttributeNames.i_Source,"pamazināmo formu atvasināšana");
                variants.addAttribute(AttributeNames.i_SourceLemma, leksēma.getValue(AttributeNames.i_Lemma));
                variants.addAttribute(AttributeNames.i_Guess, AttributeNames.v_Deminutive);
                String lemma = pamatforma + "iņ" + ending.getLemmaEnding().getEnding();
                lemma = recapitalize(lemma, originalWord);
                variants.addAttribute(AttributeNames.i_Lemma, lemma);
                rezultāts.addWordform(variants);
            }
        }
    }
}
/**
 * Decides whether a lexicon reading may be kept.
 * A reading is rejected when it is a vocative and vocatives are disabled, or when it is marked
 * plurale / singulare tantum but its number is neither the matching one nor "not applicable".
 * @param variants the wordform to check
 * @return true when the wordform should be kept
 */
private boolean isAcceptable(Wordform variants) {
    boolean unwantedVocative = !enableVocative
            && variants.isMatchingStrong(AttributeNames.i_Case, AttributeNames.v_Vocative);
    if (unwantedVocative) {
        return false;
    }

    if (variants.isMatchingStrong(AttributeNames.i_NumberSpecial, AttributeNames.v_PlurareTantum)) {
        boolean pluralOrNA = variants.isMatchingWeak(AttributeNames.i_Number, AttributeNames.v_Plural)
                || variants.isMatchingWeak(AttributeNames.i_Number, AttributeNames.v_NA);
        if (!pluralOrNA) {
            return false;
        }
    }

    if (variants.isMatchingStrong(AttributeNames.i_NumberSpecial, AttributeNames.v_SingulareTantum)) {
        return variants.isMatchingWeak(AttributeNames.i_Number, AttributeNames.v_Singular)
                || variants.isMatchingWeak(AttributeNames.i_Number, AttributeNames.v_NA);
    }
    return true;
}
/**
 * Tries to analyze the word as a known prefix attached to a verb-derived form.
 * The prefix is cut off (keeping a leading debitive and/or superlative prefix in place), the remainder is
 * analyzed with analyzeLowercase, and the readings whose paradigm has a conjugation value are returned with
 * prefix-related attributes added.
 * @param word the lowercase word to analyze; words containing a space are not processed
 * @return a Word holding the prefix-derived readings, possibly none
 */
private Word guessByPrefix(String word) {
    Word rezultāts = new Word(word);
    if (word.contains(" ")) return rezultāts;

    // Strip the debitive prefix for matching; it is put back in front of the cut word below.
    boolean vajadzība = false;
    if (word.startsWith(this.DEBITIVE_PREFIX)) {
        vajadzība = true;
        word = word.substring(this.DEBITIVE_PREFIX.length());
    }

    for (String priedēklis : prefixes) {
        if (!(word.startsWith(priedēklis) || word.startsWith(this.SUPERLATIVE_PREFIX+priedēklis)))
            continue;

        // Offsets are taken from the prefix constants themselves rather than hard-coded lengths.
        String cut_word;
        if (word.startsWith(this.SUPERLATIVE_PREFIX)) {
            cut_word = this.SUPERLATIVE_PREFIX + word.substring(this.SUPERLATIVE_PREFIX.length() + priedēklis.length());
        } else {
            cut_word = word.substring(priedēklis.length());
        }
        if (vajadzība) cut_word = this.DEBITIVE_PREFIX + cut_word;

        Word bezpriedēkļa = analyzeLowercase(cut_word, cut_word);
        for (Wordform variants : bezpriedēkļa.wordforms) {
            // Only paradigms derived from verbs (those that carry a conjugation value).
            if (variants.getEnding() == null || variants.getEnding().getParadigm() == null
                    || variants.getEnding().getParadigm().getValue(AttributeNames.i_Konjugaacija) == null)
                continue;

            // Do not generate ne- derivations for the debitive mood, nor derivations of already negated entries.
            // NOTE(review): the parentheses spell out how the original unparenthesized expression parses -
            // the "already negated" test applies to every prefix, not only to ne-; confirm that is intended.
            if ((priedēklis.equals(this.NEGATION_PREFIX) && (variants.isMatchingStrong(AttributeNames.i_Mood, AttributeNames.v_DebitiveQuotative)
                    || variants.isMatchingStrong(AttributeNames.i_Mood, AttributeNames.v_Debitive)))
                    || variants.isMatchingStrong(AttributeNames.i_Noliegums, AttributeNames.v_Yes) ) {
                continue;
            }
            // Skip readings where "vis" ended up after the prefix, e.g. nevisdomājošākais, pavisdomājošākais.
            if (variants.isMatchingStrong(AttributeNames.i_Degree, AttributeNames.v_Superlative) && !word.startsWith(this.SUPERLATIVE_PREFIX) ) {
                continue;
            }

            variants.setToken(word);
            variants.addAttribute(AttributeNames.i_Source,"priedēkļu atvasināšana");
            variants.addAttribute(AttributeNames.i_Prefix, priedēklis);
            // Negated verbs keep the lemma of the base verb; everything else gets a prefixed lemma.
            if (!priedēklis.equals(this.NEGATION_PREFIX) || !variants.isMatchingWeak(AttributeNames.i_PartOfSpeech, AttributeNames.v_Verb)) {
                variants.addAttribute(AttributeNames.i_SourceLemma, variants.getValue(AttributeNames.i_Lemma));
                variants.addAttribute(AttributeNames.i_Lemma,priedēklis+variants.getValue(AttributeNames.i_Lemma));
            }
            variants.addAttribute(AttributeNames.i_Guess, AttributeNames.v_Prefix);
            variants.addAttribute(AttributeNames.i_Noliegums, priedēklis.equals(this.NEGATION_PREFIX) ? AttributeNames.v_Yes : AttributeNames.v_No);
            rezultāts.wordforms.add(variants);
        }
    }
    return rezultāts;
}
/**
 * Re-runs the analysis for the token of the given word, replaces its wordforms with the
 * fresh ones, and then notifies the word's observers.
 * @param vārds the word to refresh in place
 */
public void reanalyze(Word vārds) {
    Word fresh = analyze(vārds.getToken());
    vārds.wordforms.clear();
    vārds.wordforms.addAll(fresh.wordforms);
    vārds.notifyObservers();
}
/**
 * Guesses possible readings of a word purely from the endings that match it.
 * Endings are tried from the longest length (word length - 2) down to the empty ending. Unless
 * enableAllGuesses is set, the search stops at the first ending length that produced a reading, except
 * for words ending in "o", where shorter endings are also tried.
 * @param word the lowercase word
 * @param originalWord the word with its original capitalization
 * @return a Word holding the guessed readings, possibly none
 */
public Word guessByEnding(String word, String originalWord) {
    Word rezultāts = new Word(word);

    for (int i=word.length()-2; i>=0; i--) { // TODO - dumb heuristic that ranks purely by ending length; it should be based on statistics
        for (Ending ending : getAllEndings().matchedEndings(word)) {
            if (ending.getEnding().length() != i) continue;

            Paradigm p = ending.getParadigm();
            if (p.isMatchingStrong(AttributeNames.i_ParadigmProperties, AttributeNames.v_HardcodedWordforms))
                continue; // the hardcoded paradigm is not relevant for guessing

            String stem;
            try {
                stem = ending.stem(word);
            } catch (Ending.WrongEndingException e) {
                throw new Error(e); // Shouldn't ever happen - matchedEndings should ensure that word contains that ending.
            }

            ArrayList<Variants> celmi = Mijas.mijuVarianti(stem, ending.getMija(), false); //FIXME - this could also be a proper name... but how would that information get here?

            for (Variants celma_variants : celmi) {
                String celms = celma_variants.celms;

                // If the final letters of the stem do not fit what the lexicon specifies for the paradigm, do not
                // guess - except for capitalized words in these three paradigms: for a proper name ending in -a/-e
                // the masculine variants for surnames are tried as well.
                if (!p.allowedGuess(celms)
                        && !(p_firstcap.matcher(originalWord).matches() && (p.getName().equalsIgnoreCase("noun-4m") ||
                            p.getName().equalsIgnoreCase("noun-4ma") || p.getName().equalsIgnoreCase("noun-3f"))))
                    continue;

                Wordform variants = new Wordform(word, null, ending);
                variants.addAttribute(AttributeNames.i_Source, "minējums pēc galotnes");
                variants.addAttribute(AttributeNames.i_Guess, AttributeNames.v_Ending);

                // FIXME what about the other variants?? and what if there are no variants?
                Ending pamatforma = ending.getLemmaEnding();
                if (pamatforma != null) {
                    // Work out the correct lemma
                    String lemma = celms + pamatforma.getEnding();
                    lemma = recapitalize(lemma, originalWord);
                    variants.addAttribute(AttributeNames.i_Lemma, lemma);
                }

                // Keep the guess only if its part of speech is enabled. With an empty ending (i == 0) it must also
                // be an indeclinable noun, an inflexible genitive or a residual, so that endingless verb forms,
                // which are usually wrong, do not show up among the guesses.
                if (((this.guessNouns && ending.getParadigm().isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Noun) &&
                        (enableVocative || !variants.isMatchingStrong(AttributeNames.i_Case, AttributeNames.v_Vocative)) &&
                        (guessInflexibleNouns || !variants.isMatchingStrong(AttributeNames.i_Declension, AttributeNames.v_NA))
                        ) ||
                        (this.guessVerbs && ending.getParadigm().isMatchingWeak(AttributeNames.i_PartOfSpeech, AttributeNames.v_Verb)) ||
                        (this.guessAdjectives && ending.getParadigm().isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective)) ||
                        (this.guessParticiples && variants.isMatchingStrong(AttributeNames.i_Mood, AttributeNames.v_Participle)) ||
                        (this.guessNouns && this.guessInflexibleNouns && variants.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Residual))
                        )
                        && (i > 0 || variants.isMatchingStrong(AttributeNames.i_Declension, AttributeNames.v_NA)
                            || variants.isMatchingStrong(AttributeNames.i_Declension, AttributeNames.v_InflexibleGenitive)
                            || variants.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Residual)
                        ))
                {
                    if (variants.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Residual)) {
                        char last = celms.charAt(celms.length() - 1);
                        if (Character.isDigit(last)) {
                            variants.removeAttribute(AttributeNames.i_ResidualType); // the default AttributeNames.v_Foreign does not fit here
                        }
                    }
                    rezultāts.wordforms.add(variants);
                }
            }
        }

        if (rezultāts.isRecognized() && !enableAllGuesses) {
            // FIXME - this should be replaced with a heuristic that finds e.g. the most plausible noun variants, the most plausible adjective cases etc.
            if (!word.endsWith("o")) // indeclinable -o nouns (no ending) do occur, as opposed to some -o forms
                break;
        }
    }
    return rezultāts;
}
/**
 * Performs morphological analysis, assuming that we know explicitly that the form is a lemma.
 * Of all the readings produced by analyze, only those are kept whose ending is the lemma ending of its
 * paradigm, or whose lemma equals the word (ignoring case) and which are marked as plurale tantum or as a
 * plural entry.
 * @param word the lemma to analyze
 * @return a Word containing only the readings compatible with the word being a lemma
 */
public Word analyzeLemma(String word) {
    Word result = new Word(word);
    Word varianti = analyze(word);
    for (Wordform vārdforma : varianti.wordforms) {
        Ending ending = vārdforma.getEnding();
        if (ending != null && ending.getLemmaEnding() == ending) {
            result.addWordform(vārdforma);
            continue;
        }

        // Comparing from the known non-null word avoids an NPE for wordforms that carry no lemma attribute.
        String lemma = vārdforma.getValue(AttributeNames.i_Lemma);
        if (word.equalsIgnoreCase(lemma) && (
                vārdforma.isMatchingStrong(AttributeNames.i_NumberSpecial, AttributeNames.v_PlurareTantum) ||
                vārdforma.isMatchingStrong(AttributeNames.i_EntryProperties, AttributeNames.v_Plural) ) )
            result.addWordform(vārdforma);
    }
    return result;
}
/**
 * Provides a list of paradigms that might be suitable for a given lemma.
 * The guessing restrictions for stem final letters and closed paradigms will be obeyed except for exceptions listed in lexicon.
 * @param lemma the lemma that should be reviewed. Plural forms will be treated as possibly valid for the case of plurare tantum
 * @return a list of Paradigm objects which are possible for this case, without duplicates, in order of descending frequency
 */
public List suitableParadigms(String lemma) {
    List<Paradigm> result = new ArrayList<>();

    Word lexicon_options = this.analyze(lemma);
    // All analysis options as a starting point. Locale.ROOT keeps the lowercasing independent of the JVM default locale.
    Word all_options = this.guessByEnding(lemma.toLowerCase(Locale.ROOT).trim(), lemma);
    for (Wordform wf : lexicon_options.wordforms) {
        all_options.addWordform(wf); // form a joint list of both known words from lexicon and also pure guessing
    }

    AttributeValues pluraretantum = new AttributeValues();
    pluraretantum.addAttribute(AttributeNames.i_PartOfSpeech, AttributeNames.v_Noun);
    pluraretantum.addAttribute(AttributeNames.i_Case, AttributeNames.v_Nominative);
    pluraretantum.addAttribute(AttributeNames.i_Number, AttributeNames.v_Plural);

    for (Wordform option : all_options.wordforms) {
        Ending ending = option.getEnding();
        // Without an ending there is no paradigm to report. Previously such a wordform caused an NPE
        // whenever it matched the plurale tantum pattern.
        if (ending == null) continue;
        if (ending.getLemmaEnding() == ending || option.isMatchingWeak(pluraretantum)) {
            result.add(ending.getParadigm());
        }
    }

    // Remove duplicates: paradigms with the same ID count as the same paradigm.
    Set<Paradigm> result_set = new TreeSet<>(new Comparator<Paradigm>() {
        @Override
        public int compare(Paradigm a, Paradigm b) {
            return Integer.compare(a.getID(), b.getID()); // avoids the overflow risk of subtracting the IDs
        }
    });
    result_set.addAll(result);
    result = new ArrayList<>(result_set);

    // Sort according to statistical frequency.
    Collections.sort(result, new ParadigmFrequencyComparator());
    Collections.reverse(result); // We want the list in order of descending frequency
    return result;
}
/**
 * Forwards the requested size to the word cache via {@code wordCache.setSize}.
 * @param maxSize value handed to the cache unchanged; presumably the maximum number of cached entries - confirm against Cache
 */
public void setCacheSize (int maxSize) {
wordCache.setSize(maxSize);
}
/**
 * Clears the word cache by calling {@code wordCache.clear()}.
 */
public void clearCache () {
wordCache.clear();
}
/**
 * Generates inflections for the lemma without restricting the result to nouns.
 * Delegates to {@code generateInflections(lemma, false)}.
 * @param lemma the lemma to inflect
 * @return the list produced by the delegate
 */
public ArrayList generateInflections(String lemma) {
return generateInflections(lemma, false);
}
/**
 * Generates inflections for the lemma with an empty attribute filter.
 * Delegates to the three-argument overload with a new, empty {@code AttributeValues}.
 * @param lemma the lemma to inflect
 * @param nouns_only passed through to the three-argument overload
 * @return the list produced by the delegate
 */
public ArrayList generateInflections(String lemma, boolean nouns_only) {
return generateInflections(lemma, nouns_only, new AttributeValues());
}
/**
 * Generates the inflected forms of a lemma.
 * Hyphenated double surnames are inflected part by part and merged. Otherwise the lemma is analyzed, the
 * readings are filtered, and inflections are generated from them. If that yields nothing and guessing is
 * enabled, the readings are guessed from the ending instead.
 * @param lemma the lemma to inflect
 * @param nouns_only passed to filterInflectionPossibilities together with the filter
 * @param filter attribute constraints passed to filterInflectionPossibilities; it is copied, not modified
 * @return the generated inflections; an empty list (never null) when nothing could be generated
 */
public ArrayList generateInflections(String lemma, boolean nouns_only, AttributeValues filter) {
    // First, check the special case of double surnames.
    if (p_doublesurname.matcher(lemma).matches()) {
        int hyphen = lemma.indexOf("-");
        AttributeValues part_filter = new AttributeValues(filter);
        part_filter.removeAttribute(AttributeNames.i_Lemma);
        // The second part is generated before the filter is relaxed further below.
        ArrayList inflections2 = generateInflections(lemma.substring(hyphen+1), nouns_only, part_filter);

        // relax filter conditions for the first part, as it can have different endings than the whole compound surname
        part_filter.removeAttribute(AttributeNames.i_Declension);
        part_filter.removeAttribute(AttributeNames.i_ParadigmID);
        if (part_filter.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Residual)) {
            part_filter.removeAttribute(AttributeNames.i_PartOfSpeech);
            part_filter.removeAttribute(AttributeNames.i_ResidualType);
        }
        ArrayList inflections1 = generateInflections(lemma.substring(0, hyphen), nouns_only, part_filter);

        // If either part turns out to be indeclinable, do not treat this as a double surname - those cases are
        // very rare and handling them would break more than it would gain.
        if ( (inflections1.size()>1 && inflections2.size()>1))
            return mergeInflections(inflections1, inflections2, "-"); // TODO - the unit tests have commented-out examples (Pavļuta-Deslandes, Freiberga-Žverelo) that would test this
    }

    Word possibilities = this.analyze(lemma);
    filterInflectionPossibilities(nouns_only, filter, possibilities.wordforms);
    ArrayList result = generateInflections_TryLemmas(lemma, possibilities);
    if (result != null) filterInflectionPossibilities(nouns_only, filter, result);

    // If result is null, it means that all the suggested lemma can be (and was) generated from another lemma - i.e. "Dīcis" from "dīkt"; but not from an existing lexicon lemma
    // We assume that a true lemma was passed by the caller, and we need to generate/guess the wordforms as if the lemma was correct.
    if ((result == null || result.size()==0) && this.enableGuessing) {
        // Locale.ROOT keeps the lowercasing independent of the JVM default locale.
        possibilities = this.guessByEnding(lemma.toLowerCase(Locale.ROOT), lemma);
        filterInflectionPossibilities(nouns_only, filter, possibilities.wordforms);
        result = generateInflections_TryLemmas(lemma, possibilities);
    }

    // If guessing didn't work, return an empty list
    if (result == null)
        result = new ArrayList();
    return result;
}
/**
 * Takes two sets of inflections and merges them into one. Currently used only for
 * double (hyphenated) surnames; possibly for other things (phrases?) later.
 *
 * Three situations are distinguished:
 *  - the first part has at most one form (treated as indeclinable): that form is put in
 *    front of every form of the second part;
 *  - otherwise, if the second part has at most one form: that form is put after every
 *    form of the first part;
 *  - otherwise every form of the first part is paired with the first form of the second
 *    part that survives filtering by the same case and number; forms of the first part
 *    for which nothing survives are silently left out.
 *
 * When the indeclinable part has no forms at all, the literal placeholder "???" is used
 * for its token and lemma.
 *
 * @param inflections1 wordforms of the first part
 * @param inflections2 wordforms of the second part
 * @param concatenator text placed between the two parts, in the token as well as in the lemma
 * @return a new list with the merged wordforms (clones; the arguments are not modified)
 */
private ArrayList mergeInflections(
        ArrayList inflections1, ArrayList inflections2,
        String concatenator) {
    ArrayList result = new ArrayList();
    if (inflections1.size() <= 1) {
        // Special case - the first part is indeclinable
        String fixedToken = "???";
        String fixedLemma = "???";
        if (!inflections1.isEmpty()) {
            fixedToken = inflections1.get(0).getToken();
            fixedLemma = inflections1.get(0).getValue(AttributeNames.i_Lemma);
        }
        for (Wordform second : inflections2) {
            result.add(prependPart(second, fixedToken, fixedLemma, concatenator));
        }
    } else if (inflections2.size() <= 1) {
        // Special case - the second part is indeclinable, so here the base information
        // comes from the first part
        String fixedToken = "???";
        String fixedLemma = "???";
        if (!inflections2.isEmpty()) {
            fixedToken = inflections2.get(0).getToken();
            fixedLemma = inflections2.get(0).getValue(AttributeNames.i_Lemma);
        }
        for (Wordform first : inflections1) {
            result.add(appendPart(first, fixedToken, fixedLemma, concatenator));
        }
    } else {
        // The normal case, where both parts decline and have to be matched up
        for (Wordform first : inflections1) {
            // We assume that only nouns will show up here
            AttributeValues agreement = new AttributeValues();
            agreement.addAttribute(AttributeNames.i_Case, first.getValue(AttributeNames.i_Case));
            agreement.addAttribute(AttributeNames.i_Number, first.getValue(AttributeNames.i_Number));
            ArrayList candidates = (ArrayList) inflections2.clone();
            filterInflectionPossibilities(true, agreement, candidates);
            if (candidates.isEmpty()) {
                // No matching partner for this form of the first part - nothing to emit
                continue;
            }
            // If several candidates remain, the first one is used
            result.add(prependPart(candidates.get(0),
                    first.getToken(), first.getValue(AttributeNames.i_Lemma), concatenator));
        }
    }
    return result;
}

// Clones 'dominant' (the second, grammatically dominant part) and puts the given token/lemma in front of its own.
private static Wordform prependPart(Wordform dominant, String token, String lemma, String concatenator) {
    Wordform merged = (Wordform) dominant.clone();
    merged.setToken(token + concatenator + merged.getToken());
    merged.addAttribute(AttributeNames.i_Lemma, lemma + concatenator + merged.getValue(AttributeNames.i_Lemma));
    // TODO - anything else?
    return merged;
}

// Clones 'dominant' (here the first part) and puts the given token/lemma after its own.
private static Wordform appendPart(Wordform dominant, String token, String lemma, String concatenator) {
    Wordform merged = (Wordform) dominant.clone();
    merged.setToken(merged.getToken() + concatenator + token);
    merged.addAttribute(AttributeNames.i_Lemma, merged.getValue(AttributeNames.i_Lemma) + concatenator + lemma);
    // TODO - anything else?
    return merged;
}
/**
 * Generates all forms of a lemma when the paradigm number is known.
 * TODO - needs more support for extra features (fixed-genitives, etc)
 *
 * The lemma is normally expected to carry the paradigm's default lemma ending. Two
 * attribute-driven exceptions are handled: plurale tantum / plural entries (the lemma is
 * matched against a plural nominative ending) and feminine entries (matched against a
 * feminine singular nominative ending).
 *
 * A temporary lexeme is created for the generation and removed from the paradigm again
 * before returning, also when generation fails with an exception.
 * FIXME - while it exists, the temporary lexeme is visible in the shared in-memory lexicon,
 * which could cause race conditions under multithreading.
 *
 * @param lemma the lemma to inflect
 * @param paradigm ID of the paradigm to use; if no such paradigm exists, or the paradigm has
 *                 more than one stem, this falls back to generateInflections(lemma)
 * @param lemmaAttributes attributes describing the lemma; they are also added to the temporary lexeme
 * @return the generated wordforms; an empty list if the paradigm has no lemma ending or the
 *         lexeme could not be created for this paradigm
 */
public ArrayList generateInflectionsFromParadigm(String lemma, int paradigm, AttributeValues lemmaAttributes) {
    final String mismatchMessage = "Attempted to generate inflections for lemma '%s' at paradigm '%d'; failed because of mismatched ending\n";

    Paradigm p = this.paradigmByID(paradigm);
    if (p == null)
        return generateInflections(lemma); // If the supplied paradigm is invalid, we ignore it
    if (p.getStems() > 1) // For 1st conjugation verbs, lemma is not enough info to inflect properly
        return generateInflections(lemma); // Assume that it will be in current lexicon..

    Ending ending = p.getLemmaEnding(); // We expect that the lemma will be the default lemma, unless...
    if (ending == null) {
        // Has to be checked here: the attribute-specific lookups below dereference the ending.
        System.err.printf(mismatchMessage, lemma, paradigm);
        return new ArrayList<>();
    }

    // if attributes list plurare tantum, then we look for plural nominative as the lemma
    if ((lemmaAttributes.isMatchingStrong(AttributeNames.i_NumberSpecial, AttributeNames.v_PlurareTantum) ||
            lemmaAttributes.isMatchingStrong(AttributeNames.i_EntryProperties, AttributeNames.v_Plural))
            && !ending.isMatchingWeak(AttributeNames.i_Number, AttributeNames.v_Plural)) {
        // Assuming that there will be only one plural nominative entry in case of plural-only words
        AttributeValues plural_nominative = new AttributeValues();
        plural_nominative.addAttribute(AttributeNames.i_Number, AttributeNames.v_Plural);
        plural_nominative.addAttribute(AttributeNames.i_Case, AttributeNames.v_Nominative);
        if (ending.getParadigm().isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective)) {
            plural_nominative.addAttribute(AttributeNames.i_Definiteness, AttributeNames.v_Indefinite);
            plural_nominative.addAttribute(AttributeNames.i_Gender, AttributeNames.v_Masculine);
        }
        // No break: if several endings fit, the last one in the paradigm wins
        for (Ending candidate_ending : ending.getParadigm().endings) {
            if (candidate_ending.isMatchingStrongOneSide(plural_nominative)
                    && lemma.endsWith(candidate_ending.getEnding())) {
                ending = candidate_ending;
            }
        }
    }

    // if attributes list feminine gender, then we look for feminine singular nominative as the lemma
    if (lemmaAttributes.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Feminine)
            && !ending.isMatchingWeak(AttributeNames.i_Gender, AttributeNames.v_Feminine)) {
        // Assuming that there will be only one fitting form
        AttributeValues feminine_lemma = new AttributeValues();
        feminine_lemma.addAttribute(AttributeNames.i_Number, AttributeNames.v_Singular);
        feminine_lemma.addAttribute(AttributeNames.i_Case, AttributeNames.v_Nominative);
        feminine_lemma.addAttribute(AttributeNames.i_Gender, AttributeNames.v_Feminine);
        if (ending.getParadigm().isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective)) {
            feminine_lemma.addAttribute(AttributeNames.i_Definiteness, AttributeNames.v_Indefinite);
        }
        for (Ending candidate_ending : ending.getParadigm().endings) {
            if (candidate_ending.isMatchingStrong(feminine_lemma)
                    && lemma.endsWith(candidate_ending.getEnding())) {
                ending = candidate_ending;
            }
        }
    }

    // No appropriate ending was found; this is reported, but lexeme creation is still attempted
    if (!lemma.endsWith(ending.getEnding())) {
        System.err.printf(mismatchMessage, lemma, paradigm);
    }

    Lexeme l = this.createLexeme(lemma, ending, "temp");
    if (l == null) { // Couldn't create the lexeme - the word wasn't compatible with the supplied paradigm
        return new ArrayList();
    }
    try {
        l.addAttributes(lemmaAttributes);

        // Workaround: the lexeme ID is needed in order to find the matching extra (hardcoded) forms
        ArrayList matching_lexemes = p.getLexemesByStem().get(0).get(l.getStem(0));
        for (Lexeme l2: matching_lexemes) {
            // FIXME - will break if there are several homoforms in the same paradigm
            if (l != l2) {
                l.setID(l2.getID());
            }
        }

        ArrayList result = generateInflections(l, lemma);
        filterInflectionPossibilities(false, null, result);
        return result;
    } finally {
        p.removeLexeme(l); // To not pollute the in-memory lexicon, even if generation threw
    }
}
/**
 * Generates all forms of a lemma when only the paradigm number is known.
 * Delegates to the three-argument overload with an empty attribute set.
 *
 * @param lemma the lemma to inflect
 * @param paradigm ID of the paradigm to use
 * @return whatever the three-argument overload returns for an empty attribute set
 */
public ArrayList generateInflectionsFromParadigm(String lemma, int paradigm) {
    AttributeValues noExtraAttributes = new AttributeValues();
    return generateInflectionsFromParadigm(lemma, paradigm, noExtraAttributes);
}
/**
 * Generates all forms when the paradigm number and the three stems are known.
 * Delegates to the six-argument overload with an empty attribute set.
 *
 * @param lemma the lemma to inflect
 * @param paradigm ID of the paradigm to use
 * @param stem1 first stem
 * @param stem2 second stem
 * @param stem3 third stem
 * @return whatever the six-argument overload returns for an empty attribute set
 */
public ArrayList generateInflectionsFromParadigm(String lemma, int paradigm, String stem1, String stem2, String stem3){
    AttributeValues noExtraAttributes = new AttributeValues();
    return generateInflectionsFromParadigm(lemma, paradigm, stem1, stem2, stem3, noExtraAttributes);
}
/**
 * Generates all forms when the paradigm number and also the three stems (as needed for
 * 1st conjugation verbs) are known.
 * FIXME - DRY, repeats the attribute-based overload.
 * FIXME - the supplied lemma is not checked against the paradigm's lemma ending (plural nouns, etc).
 *
 * A temporary lexeme is built from stem1 plus the paradigm's lemma ending, given the
 * supplied lemma and attributes, and removed from the paradigm again before returning,
 * also when generation fails with an exception.
 *
 * @param lemma the lemma to report on the generated forms
 * @param paradigm ID of the paradigm to use; if no such paradigm exists, this falls back to
 *                 generateInflections(lemma)
 * @param stem1 first stem; always applied
 * @param stem2 second stem; applied only if the paradigm has more than one stem
 * @param stem3 third stem; applied only if the paradigm has more than one stem
 * @param lemmaAttributes extra attributes added to the temporary lexeme
 * @return the generated wordforms; an empty list if the paradigm has no lemma ending or the
 *         lexeme could not be created for this paradigm
 */
public ArrayList generateInflectionsFromParadigm(String lemma, int paradigm, String stem1, String stem2, String stem3, AttributeValues lemmaAttributes) {
    Paradigm p = this.paradigmByID(paradigm);
    if (p == null)
        return generateInflections(lemma); // If the supplied paradigm is invalid, we ignore it

    Ending e = p.getLemmaEnding();
    if (e == null) {
        // Without a lemma ending no lexeme can be built for this paradigm
        return new ArrayList();
    }
    String normallemma = stem1 + e.getEnding();
    Lexeme l = this.createLexeme(normallemma, e, "temp");
    // This check must come before any use of the lexeme
    if (l == null) { // Couldn't create the lexeme - the word wasn't compatible with the supplied paradigm
        return new ArrayList();
    }
    try {
        l.addAttribute(AttributeNames.i_Lemma, lemma);
        l.addAttributes(lemmaAttributes);
        l.setStem(0, stem1);
        if (p.getStems() > 1) {
            l.setStem(1, stem2);
            l.setStem(2, stem3);
        }

        // Workaround: the lexeme ID is needed in order to find the matching extra (hardcoded) forms
        ArrayList matching_lexemes = p.getLexemesByStem().get(0).get(l.getStem(0));
        for (Lexeme l2: matching_lexemes) {
            // FIXME - will break if there are several homoforms in the same paradigm
            if (l != l2) {
                l.setID(l2.getID());
            }
        }

        ArrayList result = generateInflections(l, lemma);
        filterInflectionPossibilities(false, null, result);
        return result;
    } finally {
        p.removeLexeme(l); // To not pollute the in-memory lexicon, even if generation threw
    }
}
/**
 * Removes, in place, the possibilities that aren't nouns/substantivised adjectives (when
 * nouns_only is set) and those that don't match the filter.
 *
 * @param nouns_only if true, a wordform is kept only when it is a noun, a conversion to noun,
 *                   a definite adjective, or a foreign residual; if false, part of speech
 *                   alone never disqualifies a wordform
 * @param filter attributes the wordform has to weakly match; foreign residuals and forms
 *               whose declension is "not applicable" are exempt from this check
 * @param possibilities the list to prune; modified by this call
 */
public void filterInflectionPossibilities(boolean nouns_only, AttributeValues filter, ArrayList possibilities) {
    ArrayList rejected = new ArrayList();
    for (Wordform wf : possibilities) {
        // Without nouns_only everything starts out accepted; with it, a form has to be
        // admitted by one of the noun-like categories below.
        boolean keep = !nouns_only;
        keep |= wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Noun);
        keep |= wf.isMatchingStrong(AttributeNames.i_Conversion, AttributeNames.v_Noun);
        keep |= wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective)
                && wf.isMatchingStrong(AttributeNames.i_Definiteness, AttributeNames.v_Definite);
        // Foreign names such as Vadim, Kirill etc.
        keep |= wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Residual)
                && wf.isMatchingStrong(AttributeNames.i_ResidualType, AttributeNames.v_Foreign);

        // (Suppressing plural forms of singular toponyms used to be done here; that is
        // now implemented with the flag 'Morfotabulas attēlošana'.)

        // Do not generate positive forms of comparative/superlative adjectives
        keep &= !(wf.isMatchingStrong(AttributeNames.i_EntryProperties, AttributeNames.v_EntryComparative)
                && wf.isMatchingStrong(AttributeNames.i_Degree, AttributeNames.v_Positive));

        // The filter overrides everything except inflexible stuff
        keep &= wf.isMatchingWeak(filter)
                || wf.isMatchingStrong(AttributeNames.i_ResidualType, AttributeNames.v_Foreign)
                || wf.isMatchingStrong(AttributeNames.i_Declension, AttributeNames.v_NA);

        if (!keep) {
            rejected.add(wf);
        }
    }
    possibilities.removeAll(rejected);
}
/**
 * Attempts to find the "proper lemma" among the analysis options provided, possibly making a
 * new temporary lexeme if needed, and then generates the inflections from that lemma.
 * TODO - needs refactoring and unittests
 *
 * The analysis options are tried in order and the first one that fits decides the result.
 * Vocative readings are skipped. An option fits if
 *  - its lemma (or its paradigm-lemma attribute) equals the requested lemma, ignoring case;
 *  - or the requested lemma is the negation prefix plus the option's lemma and the option has a lexeme;
 *  - or the option is an adjective and the requested lemma looks like its nominalized
 *    form ("-ais", or feminine "-ā"), as in adjective-derived surnames.
 *
 * Temporary lexemes are created with the source tag "generateInflectionsFromParadigm" and
 * are removed from their paradigm once the forms have been generated, also if generation throws.
 *
 * @param lemma the lemma whose inflections are wanted
 * @param w analysis result whose wordforms are the candidate readings
 * @return the generated wordforms, or null if no analysis option fits
 */
public ArrayList generateInflections_TryLemmas(String lemma, Word w) {
    for (Wordform wf : w.wordforms) {
        if (wf.isMatchingStrong(AttributeNames.i_Case, AttributeNames.v_Vocative))
            continue; // Vocatives often match lemmas and are false positives

        // Try each of the analysis options: is it the base form (does it match the requested lemma)?
        Lexeme lex = wf.lexeme;

        // The regular case where lemmas must be "normal"
        if (wf.getValue(AttributeNames.i_Lemma).equalsIgnoreCase(lemma) ||
                lemma.equalsIgnoreCase(wf.getValue(AttributeNames.i_LemmaParadigm)) ) {
            if (lex == null || !lex.getValue(AttributeNames.i_Lemma).equalsIgnoreCase(lemma)) { // NB! this is lex.lemma not wf.lemma that's checked earlier
                // Not the right lexeme (derivation or guessing), so a lexeme is created
                Ending ending = wf.getEnding();
                if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adverb))
                    ending = this.paradigmByName("adverb").getLemmaEnding();
                // FIXME - a hardcoded exception, for lack of a known proper general solution
                if (lemma.endsWith("šana") && wf.getEnding().getParadigm().isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Verb)) {
                    ending = this.paradigmByName("noun-4f").getLemmaEnding();
                }

                lex = this.createLexeme(lemma, ending, "generateInflectionsFromParadigm"); // Temporary lexeme
                if (lex == null)
                    continue; // The lemma wasn't compatible with this ending - try the next analysis option
                if (lex.getValue(AttributeNames.i_PartOfSpeech) == null)
                    lex.addAttribute(AttributeNames.i_PartOfSpeech, wf.getValue(AttributeNames.i_PartOfSpeech)); // So that hardcoded word classes have a POS - abbreviations etc
                if (p_firstcap.matcher(lemma).matches())
                    lex.addAttribute(AttributeNames.i_NounType, AttributeNames.v_ProperNoun); //FIXME - hack for inflecting personal names such as 'Valdis'
                if (wf.getEnding().getParadigm().getStems() > 1 && wf.lexeme != null && wf.getValue(AttributeNames.i_Prefix) != null) { // For prefix derivation the prefix has to be added to the other stems as well
                    lex.setStem(1, wf.getValue(AttributeNames.i_Prefix) + wf.lexeme.getStem(1));
                    lex.setStem(2, wf.getValue(AttributeNames.i_Prefix) + wf.lexeme.getStem(2));
                }
            }
            try {
                return generateInflections(lex, lemma);
            } finally {
                if (lex.isMatchingStrong(AttributeNames.i_Source, "generateInflectionsFromParadigm"))
                    lex.getParadigm().removeLexeme(lex); // removed temporary lexeme
            }
        }

        if (lemma.startsWith(this.NEGATION_PREFIX) && lemma.equalsIgnoreCase(this.NEGATION_PREFIX + wf.getValue(AttributeNames.i_Lemma)) && lex != null) {
            // inflection of negated verbs/participles
            return generateInflections(lex, lemma);
        }

        // The case for nominalized adjectives such as adjective-derived surnames
        if ( wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective) && (
                (lemma.toLowerCase().endsWith("ais") && lemma.equalsIgnoreCase(wf.getValue(AttributeNames.i_Lemma).substring(0, wf.getValue(AttributeNames.i_Lemma).length()-1)+"ais")) ||
                (lemma.toLowerCase().endsWith("ā") && wf.getValue(AttributeNames.i_Lemma).equalsIgnoreCase(lemma.substring(0, lemma.length()-1)+"s") && wf.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Feminine)) ) ) {
            // Exception for adjective-based surnames "Lielais", "Platais" etc
            if ((lex == null && lemma.toLowerCase().endsWith("ais")) || (lex != null && !lex.getValue(AttributeNames.i_Lemma).equalsIgnoreCase(lemma))) {
                lex = this.createLexeme(lemma, wf.getEnding(), "generateInflectionsFromParadigm");
                if (lex != null && p_firstcap.matcher(lemma).matches())
                    lex.addAttribute(AttributeNames.i_NounType, AttributeNames.v_ProperNoun); //FIXME - hack for inflecting personal names such as 'Valdis'
            }
            if (lex == null) continue; // Also covers a failed createLexeme above

            ArrayList result = new ArrayList();
            try {
                // Only the definite, positive-degree forms of the matching gender are kept
                for (Wordform wf2 : generateInflections(lex, lemma)) {
                    if (wf2.isMatchingStrong(AttributeNames.i_Definiteness, AttributeNames.v_Definite) && wf2.isMatchingStrong(AttributeNames.i_Degree, AttributeNames.v_Positive) && wf2.isMatchingWeak(AttributeNames.i_Gender, wf.getValue(AttributeNames.i_Gender))) {
                        result.add(wf2);
                    }
                }
            } finally {
                if (lex.isMatchingStrong(AttributeNames.i_Source, "generateInflectionsFromParadigm"))
                    lex.getParadigm().removeLexeme(lex);
            }
            return result;
        }
    }
    return null;
}
/**
 * Generates the wordforms of a lexeme.
 *
 * Works in three stages:
 *  1. For every ending of the lexeme's paradigm whose part of speech is compatible with
 *     the lexeme, the stem variants produced by Mijas.MijasLocīšanai are combined with the
 *     ending, and the resulting forms are kept unless one of the validity checks below rejects them.
 *  2. Hardcoded forms registered under the lexeme's ID are added; unless such a form is
 *     marked as an extra form, it replaces a generated form that weakly matches it.
 *  3. For verb paradigms, unless this call is already producing negated forms, the negated
 *     forms are appended via one recursive call with the negation-prefixed lemma.
 *
 * @param lexeme the lexeme to inflect
 * @param lemma the lemma as requested by the caller; it is used to restore capitalization, and
 *              if it equals NEGATION_PREFIX + the lexeme's lemma (ignoring case), negated
 *              forms are generated
 * @return list of generated wordforms
 */
public ArrayList generateInflections(Lexeme lexeme, String lemma)
{
String trešāSakne = null, vārds;
ArrayList inflections = new ArrayList(1);
// The 3rd stem has to be known for the future-tense stem change of the 1st conjugation
if (lexeme.getParadigm().getStems() == 3) {
trešāSakne = lexeme.getStem(2);
}
// True when the caller asked for the negated counterpart of this lexeme
boolean noliegums = lemma.equalsIgnoreCase(this.NEGATION_PREFIX+lexeme.getValue(AttributeNames.i_Lemma));
for (Ending ending : lexeme.getParadigm().endings){
// Use the ending only if its part of speech is unset, the lexeme's is unset, or the two are equal
if ( ending.getValue(AttributeNames.i_PartOfSpeech)==null ||
ending.getValue(AttributeNames.i_PartOfSpeech).equals(lexeme.getValue(AttributeNames.i_PartOfSpeech)) ||
lexeme.getValue(AttributeNames.i_PartOfSpeech) == null) {
// Flag passed on to the stem-change routine: set for definite endings and for adverb endings
boolean vispārākāPak = ending.isMatchingStrong(AttributeNames.i_Definiteness, AttributeNames.v_Definite) ||
ending.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adverb);
boolean properName = lexeme.isMatchingStrong(AttributeNames.i_NounType, AttributeNames.v_ProperNoun);
// The stem before the stem change is applied; stemID is 1-based
String pirmsmijascelms = lexeme.getStem(ending.stemID-1);
if (noliegums) {
pirmsmijascelms = this.NEGATION_PREFIX + pirmsmijascelms;
}
ArrayList celmi = Mijas.MijasLocīšanai(pirmsmijascelms, ending.getMija(), trešāSakne, vispārākāPak, properName);
for (Variants celms : celmi){
vārds = celms.celms + ending.getEnding();
vārds = recapitalize(vārds, lemma);
Wordform locījums = new Wordform(vārds, lexeme, ending);
locījums.addAttributes(celms);
// The form starts out valid only if it weakly matches Generate=Yes; the checks below can only reject it
boolean validOption = locījums.isMatchingWeak(AttributeNames.i_Generate, AttributeNames.v_Yes);
// No singular forms for plural-only words and no plural forms for singular-only words
if (locījums.isMatchingStrong(AttributeNames.i_NumberSpecial, AttributeNames.v_PlurareTantum) && locījums.isMatchingStrong(AttributeNames.i_Number, AttributeNames.v_Singular)) validOption = false;
if (locījums.isMatchingStrong(AttributeNames.i_NumberSpecial, AttributeNames.v_SingulareTantum) && locījums.isMatchingStrong(AttributeNames.i_Number, AttributeNames.v_Plural)) validOption = false;
if (GenerationBlacklist.blacklist(locījums)) validOption = false;
// The negation attribute has to be set before the debitive check below, which reads it
if (noliegums) locījums.addAttribute(AttributeNames.i_Noliegums, AttributeNames.v_Yes);
// No debitive forms for negated forms, nor for the stem "vajadzē"
if ((locījums.isMatchingStrong(AttributeNames.i_Noliegums, AttributeNames.v_Yes) ||
lexeme.getStem(0).equalsIgnoreCase("vajadzē")) &&
(locījums.isMatchingStrong(AttributeNames.i_Mood, AttributeNames.v_DebitiveQuotative)
|| locījums.isMatchingStrong(AttributeNames.i_Mood, AttributeNames.v_Debitive))) validOption = false;
// Adjectives with feminine gender but masculine endings - ālava / ālavs; likewise the obsolescent numeral forms such as 'tūkstošām'
if ((locījums.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective) ||
locījums.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Numeral) )&&
lexeme.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Feminine) &&
ending.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Masculine)) validOption = false;
if (validOption) inflections.add(locījums);
}
}
}
if (lexeme.getParadigm().isMatchingStrong(AttributeNames.i_ParadigmProperties, AttributeNames.v_OnlyHardcodedWordforms)) {
inflections = new ArrayList(1); // In this case no ending-based wordforms are added at all
}
// Check whether this lemma has a hardcoded form override (for example, the form viņš *ej -> viņš iet)
Collection hc_forms = this.hardcodedForms.get(lexeme.getID());
// if (hc_forms.isEmpty() && lemma.startsWith(this.NEGATION_PREFIX) && (lemma.endsWith("t") || lemma.endsWith("ties"))) {
// hc_forms = this.hardcodedForms.get(lemma.substring(2));
// }
for (Lexeme formLexeme : hc_forms) {
Ending ending = formLexeme.getParadigm().getLemmaEnding();
Wordform hardcoded = new Wordform(formLexeme.getStem(0), formLexeme, ending);
if (!hardcoded.isMatchingWeak(AttributeNames.i_Generate, AttributeNames.v_Yes))
continue;
// Except for pronouns, negated hardcoded forms are used only when the lemma starts with the negation prefix ...
if (hardcoded.isMatchingStrong(AttributeNames.i_Noliegums, AttributeNames.v_Yes) && !hardcoded.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Pronoun) && !lemma.startsWith(this.NEGATION_PREFIX))
continue;
// ... and explicitly non-negated hardcoded forms only when it does not
if (hardcoded.isMatchingStrong(AttributeNames.i_Noliegums, AttributeNames.v_No) && !hardcoded.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Pronoun) && lemma.startsWith(this.NEGATION_PREFIX))
continue;
if (!hardcoded.isMatchingStrong(AttributeNames.i_ExtraForm, AttributeNames.v_Yes)) {
Wordform override = null;
for (Wordform form : inflections) { // check whether one of the existing inflections has to be dropped because this hardcoded variant replaces it
if (form.isMatchingWeak(formLexeme)) {
override = form;
}
}
// Only one form is removed - the last match found by the loop above
if (override != null) {
inflections.remove(override);
}
}
inflections.add(hardcoded);
}
// For verbs, generate also negated forms
// The recursive call has noliegums == true, so it does not recurse again
if (!noliegums && lexeme.getParadigm().isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Verb) && !lexeme.isMatchingStrong(AttributeNames.i_Noliegums, AttributeNames.v_Yes)) {
ArrayList negated_inflections = generateInflections(lexeme,this.NEGATION_PREFIX+lexeme.getValue(AttributeNames.i_Lemma));
inflections.addAll(negated_inflections);
}
return inflections;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy