lv.semti.morphology.webservice.InflectResource Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of webservices Show documentation
Show all versions of webservices Show documentation
Webservice API for Tēzaurs.lv and other ailab.lv Latvian computational linguistic tools
/*******************************************************************************
* Copyright 2012, 2013, 2014 Institute of Mathematics and Computer Science, University of Latvia
* Author: Pēteris Paikens
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*******************************************************************************/
package lv.semti.morphology.webservice;
import java.io.*;
import java.net.URLDecoder;
import java.util.*;
import lv.semti.morphology.analyzer.Analyzer;
import lv.semti.morphology.analyzer.Splitting;
import lv.semti.morphology.analyzer.Word;
import lv.semti.morphology.analyzer.Wordform;
import lv.semti.morphology.attributes.AttributeNames;
import lv.semti.morphology.attributes.AttributeValues;
import lv.semti.morphology.lexicon.Paradigm;
import org.restlet.data.MediaType;
import org.restlet.representation.Representation;
import org.restlet.representation.StringRepresentation;
import org.restlet.resource.Get;
import org.restlet.resource.ServerResource;
public class InflectResource extends ServerResource {
/**
 * Handles GET requests: inflects the text in the {@code query} request attribute and returns
 * the generated wordforms.
 *
 * <p>Inputs read from the request:
 * <ul>
 *   <li>attributes {@code query}, {@code language} and {@code format};</li>
 *   <li>query-string values {@code inflmisc}, {@code paradigm}, {@code guess}, {@code stem1},
 *       {@code stem2} and {@code stem3}, which are forwarded to {@code inflect}.</li>
 * </ul>
 *
 * <p>The response is XML when {@code format} equals "xml" (case-insensitive) and JSON otherwise;
 * a missing {@code format} is treated as "json". In JSON mode, a {@code language} of "EN"
 * (case-insensitive) converts each wordform with {@code MorphoServer.tagset.toEnglish} first.
 *
 * @return the serialized wordforms, grouped per input token
 */
@Get
public Representation retrieve() {
    getResponse().setAccessControlAllowOrigin("*");
    String query = (String) getRequest().getAttributes().get("query");
    String language = (String) getRequest().getAttributes().get("language");
    String inflmisc = getQuery().getValues("inflmisc");
    List<Collection<Wordform>> processedtokens = inflect(query, getQuery().getValues("paradigm"), getQuery().getValues("guess"),
            getQuery().getValues("stem1"), getQuery().getValues("stem2"), getQuery().getValues("stem3"), decodeInflMisc(inflmisc));
    String format = (String) getRequest().getAttributes().get("format");
    if (format == null) {
        format = "json";
    }
    if (format.equalsIgnoreCase("xml")) {
        StringWriter s = new StringWriter();
        try {
            // NOTE(review): these literals contain only whitespace/newlines; the opening and closing
            // XML element tags appear to have been lost from this copy of the file - confirm against
            // the upstream source before relying on the XML output.
            s.write("\n");
            for (Collection<Wordform> token : processedtokens) {
                s.write("\n");
                for (Wordform wf : token) {
                    wf.toXML(s);
                }
                s.write(" \n");
            }
            s.write(" \n");
        } catch (IOException e) {
            // Best effort: whatever was written before the failure is still returned.
            e.printStackTrace();
        }
        return new StringRepresentation(s.toString(), MediaType.APPLICATION_XML);
    } else {
        boolean english = "EN".equalsIgnoreCase(language);
        List<String> tokenJSON = new ArrayList<>();
        for (Collection<Wordform> token : processedtokens) {
            List<String> wordJSON = new ArrayList<>();
            for (Wordform wf : token) {
                if (english) {
                    wordJSON.add(MorphoServer.tagset.toEnglish(wf).toJSON());
                } else {
                    wordJSON.add(wf.toJSON());
                }
            }
            // each token becomes a nested JSON array of its wordforms
            tokenJSON.add(formatJSON(wordJSON));
        }
        return new StringRepresentation(formatJSON(tokenJSON), MediaType.APPLICATION_JSON);
    }
}
/**
 * Converts the comma-separated {@code inflmisc} request parameter into lemma attributes.
 *
 * <p>Recognized keywords (compared case-insensitively, without trimming):
 * "Vīriešu_dzimte" and "Sieviešu_dzimte" set the gender, "Daudzskaitlis" and "Vienskaitlis"
 * set the special-number attribute, and "Noliegums" sets the negation attribute.
 * Any other keyword is ignored.
 *
 * @param inflmisc comma-separated keywords; may be {@code null}
 * @return the collected attributes; empty when {@code inflmisc} is {@code null}
 */
private static AttributeValues decodeInflMisc(String inflmisc) {
    AttributeValues decoded = new AttributeValues();
    if (inflmisc == null) {
        return decoded;
    }
    String[] flags = inflmisc.split(",");
    for (int idx = 0; idx < flags.length; idx++) {
        String flag = flags[idx];
        // The keywords are mutually exclusive, so at most one branch applies per flag.
        if (flag.equalsIgnoreCase("Vīriešu_dzimte")) {
            decoded.addAttribute(AttributeNames.i_Gender, AttributeNames.v_Masculine);
        } else if (flag.equalsIgnoreCase("Sieviešu_dzimte")) {
            decoded.addAttribute(AttributeNames.i_Gender, AttributeNames.v_Feminine);
        } else if (flag.equalsIgnoreCase("Daudzskaitlis")) {
            decoded.addAttribute(AttributeNames.i_NumberSpecial, AttributeNames.v_PlurareTantum);
        } else if (flag.equalsIgnoreCase("Vienskaitlis")) {
            decoded.addAttribute(AttributeNames.i_NumberSpecial, AttributeNames.v_SingulareTantum);
        } else if (flag.equalsIgnoreCase("Noliegums")) {
            decoded.addAttribute(AttributeNames.i_Noliegums, AttributeNames.v_Yes);
        }
    }
    return decoded;
}
public List> inflect(String query, String paradigm_param, String guess_param, String stem1, String stem2, String stem3, AttributeValues lemmaAttrs) {
try {
query = URLDecoder.decode(query, "UTF8");
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
Analyzer analyzer = MorphoServer.getAnalyzer();
Paradigm paradigm = null;
if (paradigm_param != null) {
try {
int paradigmID = Integer.decode(paradigm_param);
paradigm = analyzer.paradigmByID(paradigmID);
} catch (NumberFormatException e) {
// Using paradigm names
if (paradigm_param.endsWith("ltg")) {
analyzer = MorphoServer.getLatgalian_analyzer();
}
paradigm = analyzer.paradigmByName(paradigm_param);
}
if (paradigm == null) {
System.err.printf("Could not find paradigm '%s'\n", paradigm_param);
}
}
boolean guess = true;
if ("false".equalsIgnoreCase(guess_param)) {
guess = false;
}
analyzer.enableGuessing = guess;
analyzer.enableVocative = true;
analyzer.guessVerbs = false;
analyzer.guessParticiples = false;
analyzer.guessAdjectives = false;
analyzer.guessInflexibleNouns = guess;
analyzer.enableAllGuesses = guess;
LinkedList showAttrs = new LinkedList();
//FIXME - this set is not appropriate for inflecting verbs and others...
showAttrs.add(AttributeNames.i_Word); showAttrs.add(AttributeNames.i_PartOfSpeech); showAttrs.add(AttributeNames.i_Derivative);
//nouns
showAttrs.add(AttributeNames.i_Case); showAttrs.add(AttributeNames.i_Number); showAttrs.add(AttributeNames.i_Gender); showAttrs.add(AttributeNames.i_Declension);
//verbs/participles
showAttrs.add(AttributeNames.i_Person); showAttrs.add(AttributeNames.i_Mood); showAttrs.add(AttributeNames.i_Tense); showAttrs.add(AttributeNames.i_Voice); showAttrs.add(AttributeNames.i_Konjugaacija); showAttrs.add(AttributeNames.i_Noliegums);
//adjectives
showAttrs.add(AttributeNames.i_Degree); showAttrs.add(AttributeNames.i_Definiteness);
// usage restrictions are necessary for distinguishing which forms to use/show
showAttrs.add(AttributeNames.i_Frequency); showAttrs.add(AttributeNames.i_Usage); showAttrs.add(AttributeNames.i_Normative);
List tokens = Splitting.tokenize(analyzer, query);
LinkedList> processedTokens = new LinkedList<>();
for (Word word : tokens) {
List formas;
if (paradigm == null) { // no specific options passed or not found
// checking for special case of common-gender nouns of 4th/5th declension - 'paziņa', 'auša', 'bende'
boolean male_4th = false;
boolean female_4th = false;
boolean male_5th = false;
boolean female_5th = false;
for (Wordform analysis_case : word.wordforms) {
if (analysis_case.isMatchingStrong(AttributeNames.i_ParadigmID, "7")) female_4th = true;
if (analysis_case.isMatchingStrong(AttributeNames.i_ParadigmID, "8")) male_4th = true;
if (analysis_case.isMatchingStrong(AttributeNames.i_ParadigmID, "9")) female_5th = true;
if (analysis_case.isMatchingStrong(AttributeNames.i_ParadigmID, "10")) male_5th = true;
}
// word.describe(new PrintWriter(System.out));
// System.out.printf("Inflectresource: vārds %s. male4:%b female4:%b male5:%b female5:%b\n", word.getToken(), male_4th, female_4th, male_5th, female_5th);
if (male_4th && female_4th) { // if so, then build both inflections and merge their forms.
formas = analyzer.generateInflectionsFromParadigm(word.getToken(), 7, lemmaAttrs);
formas.addAll(analyzer.generateInflectionsFromParadigm(word.getToken(), 8, lemmaAttrs));
} else if (male_5th && female_5th) {
formas = analyzer.generateInflectionsFromParadigm(word.getToken(), 9, lemmaAttrs);
formas.addAll(analyzer.generateInflectionsFromParadigm(word.getToken(), 10, lemmaAttrs));
} else
formas = analyzer.generateInflections(word.getToken()); // normal case of building just from the token
} else {
// if ((paradigm.getStems()>1) && stem1 != null && stem2 != null && stem3 != null) {
// // For 1st conjugation verbs, if all three stems are passed, then try to use them for inflection
if ( (stem1 != null && !stem1.equalsIgnoreCase("")) || (stem2 != null && stem2.equalsIgnoreCase("")) || (stem3 != null && stem3.equalsIgnoreCase(""))) {
formas = multistem_generate(analyzer, word.getToken(), paradigm.getID(), stem1, stem2, stem3);
} else {
// if a specific paradigm is passed, inflect according to that
formas = analyzer.generateInflectionsFromParadigm(word.getToken(), paradigm.getID(), lemmaAttrs);
}
}
for (Wordform wf : formas) {
wf.filterAttributes(showAttrs);
wf.lexeme = null; // so that identical forms would compare as equal
}
processedTokens.add(new LinkedHashSet<>(formas));
}
analyzer.defaultSettings();
return processedTokens;
}
/**
* Generates wordforms with the assumption that "stemX" may contain multiple alternate stem options separated by a comma.
* If no comma-separated expansion applies, all arguments are passed unchanged to
* analyzer.generateInflectionsFromParadigm(token, paradigmID, stem1, stem2, stem3).
*
* NOTE(review): part of this method is missing from this copy of the file (see the note inside the body),
* so the exact expansion rules for stem2 cannot be documented from here - confirm against the upstream source.
*
* @param analyzer the analyzer used to generate the inflections
* @param token the word to inflect
* @param paradigmID ID of the paradigm to inflect by
* @param stem1 first stem; passed through unchanged by the visible code
* @param stem2 second stem; a value containing a comma is treated as a list of alternatives
* @param stem3 third stem; comma-separated alternatives are trimmed and expanded one by one via a recursive call
* @return the generated wordforms; for expanded alternatives, the results of all recursive calls concatenated
*/
private List multistem_generate(Analyzer analyzer, String token, Integer paradigmID, String stem1, String stem2, String stem3) {
if (stem2 != null && stem2.contains(",")) {
List formas = new LinkedList<>();
String[] stems2 = stem2.split(",");
// NOTE(review): stem3 is dereferenced here although only stem2 was null-checked above - confirm callers always pass stem3 together with a multi-valued stem2.
String[] stems3 = stem3.split(",");
// NOTE(review): the next line is truncated - a 'for' header runs straight into a variable declaration.
// The loop condition, the loop body and at least one closing brace / branch condition appear to be missing,
// so the method does not compile as shown. Restore it from the upstream source rather than guessing.
for (int i=0; i formas = new LinkedList<>();
// Expand each comma-separated alternative of stem3 (trimmed) through a recursive call and collect the results.
for (String stem : stem3.split(",")) {
formas.addAll(multistem_generate(analyzer, token, paradigmID, stem1, stem2, stem.trim()));
}
return formas;
}
// System.out.printf("%s\t%s\t%s\n", stem1, stem2, stem3);
// Base case: hand the stems to the analyzer as they are.
return analyzer.generateInflectionsFromParadigm(token, paradigmID, stem1, stem2, stem3);
}
/**
 * Joins already-serialized JSON values into a JSON array literal.
 *
 * <p>The elements are appended via their string representation, in iteration order, separated
 * by {@code ",\n"} and wrapped in square brackets. No escaping or quoting is performed, so each
 * element must already be valid JSON.
 *
 * @param tags the serialized JSON values; must not be {@code null}
 * @return the array literal, {@code "[]"} for an empty collection
 */
private String formatJSON(Collection<?> tags) {
    // StringBuilder avoids re-copying the whole output on every appended element.
    StringBuilder out = new StringBuilder("[");
    Iterator<?> it = tags.iterator();
    while (it.hasNext()) {
        out.append(it.next());
        if (it.hasNext()) {
            out.append(",\n");
        }
    }
    return out.append(']').toString();
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy