// lv.lumii.morphotagger.MorphoPipe (source as listed on a Maven / Gradle / Ivy artifact browser)
/*******************************************************************************
* Copyright 2012,2013,2014 Institute of Mathematics and Computer Science, University of Latvia
* Author: Pēteris Paikens
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.  If not, see <http://www.gnu.org/licenses/>.
*******************************************************************************/
package lv.lumii.morphotagger;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import lv.semti.morphology.analyzer.Splitting;
import lv.semti.morphology.analyzer.Word;
import lv.semti.morphology.analyzer.Wordform;
import lv.semti.morphology.attributes.AttributeNames;
import lv.semti.morphology.attributes.AttributeValues;
import org.json.simple.JSONValue;
import edu.stanford.nlp.ie.ner.CMMClassifier;
import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.ExtraColumnAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LVMorphologyAnalysis;
import edu.stanford.nlp.ling.CoreAnnotations.ParentAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Datum;
import edu.stanford.nlp.sequences.LVMorphologyReaderAndWriter;
// Copied/pasted/mangled from transliteration webservices java project
// Command-line pipeline that tokenizes Latvian text, runs the CMM morphotagger
// and serializes the analyses in one of several output formats.
// NOTE(review): this copy of the source has passed through an HTML stripper -
// generic type parameters and angle-bracketed markup inside string literals
// (e.g. "<s>" sentence markers) have been removed throughout the file.
public class MorphoPipe {
// Supported interpretations of stdin.
private enum inputTypes {SENTENCE, PARAGRAPH, VERT, CONLL, JSON};
// Supported output serializations.
private enum outputTypes {JSON, TAB, VERT, MOSES, CONLL_X, XML, VISL_CG, lemmatizedText, lowercasedText, analyzerOptions};
private static String eol = System.getProperty("line.separator");
private static String field_separator = "\t";
private static String token_separator = eol;
private static boolean mini_tag = false; // strip lexical/nonessential parts of the tag
private static boolean features = false; // include training features in CONLL output
private static boolean LETAfeatures = false; // extra semantic-frame features in CONLL output
private static inputTypes inputType = inputTypes.PARAGRAPH;
private static outputTypes outputType = outputTypes.CONLL_X;
//private static int sentencelengthcap = Splitting.DEFAULT_SENTENCE_LENGTH_CAP;
private static int sentencelengthcap = 100; // max tokens per sentence when splitting paragraphs
private static boolean saveColumns = false; // carry extra CONLL input columns through to output
private static boolean keepTags = false; // echo lines starting with '<' (xml-style metadata)
private static boolean saveCase = false; // for lemmatized text output format
private static boolean outputSeparators = false; // for sentences, for paragraphs
private static boolean whitespaceMarker = false; // mark token pairs that had no whitespace between them
private static boolean stopOnEmpty = true; // quit on empty line
private static String morphoClassifierLocation = "models/lv-morpho-model.ser.gz"; //FIXME - make it configurable
/**
 * Entry point: parses command-line flags, loads the morphotagging model and
 * streams stdin through the tagger to stdout (both forced to UTF-8).
 * NOTE(review): the option-parsing loop below is visibly truncated - the body
 * of "for (int i=0; i..." (presumably "i < args.length" plus a chain of flag
 * checks setting the static config fields, ending in a help printout) was lost
 * to HTML stripping, leaving usage text fused onto the for-statement. Recover
 * it from the upstream repository before attempting to compile.
 */
public static void main(String[] args) throws Exception {
for (int i=0; i. Any XML-style tags are echoed as-is. \n\t\tNB! sentences are retokenized, the number of tokens may be different.");
System.out.println("\t-conll-in : CONLL shared task data format - one line per token, with tab-delimited columns, sentences separated by blank lines.");
System.out.println("\t-json-in : one line per sentence, each line contains a single json array of strings-tokens.");
System.out.println("\nOutput formats");
System.out.println("\tDefault : JSON. Each sentence is returned as a list of dicts, each dict contains elements 'Word', 'Tag' and 'Lemma'.");
System.out.println("\t-tab : one response line for each query line; tab-separated lists of word, tag and lemma.");
System.out.println("\t-vert : one response line for each token; tab-separated lists of word, tag and lemma.");
System.out.println("\t-moses : one response line for each token; pipe-separated lists of word, tag and lemma.");
System.out.println("\t-conll-x : CONLL-X shared task data format - one line per token, with tab-delimited columns, sentences separated by blank lines.");
System.out.println("\t-xml : one xml word per line");
System.out.println("\t-visl-cg : output format for VISL constraint grammar tool");
System.out.println("\t-lemmatized-text : output lowercase lemmatized text, each sentence in new row, tokens seperated by single space");
System.out.println("\t-lowercased-text : output lowercased text, each sentence in new row, tokens seperated by single space");
System.out.println("\t-analyzer : one response line for each token; word followed by a tab-separated list of undisambiguated morphological tag options");
System.out.println("\nOther options:");
System.out.println("\t-stripped : lexical/nonessential parts of the tag are replaced with '-' to reduce sparsity.");
System.out.println("\t-features : in conll output, include the features that were used for training/tagging.");
System.out.println("\t-leta : in conll output, include extra features used for semantic frame analysis.");
System.out.println("\t-saveColumns : save extra columns from conll input.");
System.out.println("\t-unix-line-endings : use \\n line endings for output even on windows systems");
System.out.println("\t-keep-tags : preserve lines that start with '<' to enable xml-style metadata");
System.out.println("\t-output-separators : put sentence markup and paragraph markup");
System.out.println("\t-whitespace-marker : put tags where the tokens did not have whitespace between them");
System.out.println("\t-allow-empty-lines : do not quit on blank lines input (as per default)");
System.out.flush();
System.exit(0);
}
}
// Load the serialized CMM morphotagger model and wire up UTF-8 console streams.
CMMClassifier morphoClassifier = CMMClassifier.getClassifier(morphoClassifierLocation);
PrintStream out = new PrintStream(System.out, true, "UTF8");
BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF8"));
switch(inputType) {
case CONLL:
// CONLL input: sentences are pre-tokenized, analyze each one directly.
for (List sentence : readCONLL(in)) {
outputSentence(morphoClassifier, out, sentence);
}
break;
default:
// Line-oriented input: accumulate lines into sentences, then process.
String s;
String sentence = "";
while ((s = in.readLine()) != null && (s.length() != 0 || !stopOnEmpty)) {
// Echo xml-style metadata lines untouched when -keep-tags is on.
if (s.startsWith("<") && s.length()>1 && keepTags) {
if (outputType != outputTypes.lemmatizedText && outputType != outputTypes.lowercasedText) out.println(s);
continue;
}
if (s.length() == 0) continue;
boolean finished = true; // is sentence finished and ready to analyze
if (inputType != inputTypes.VERT) {
sentence = s;
} else {
// VERT input: one token per line; tags are echoed, tokens accumulated.
if (s.startsWith("<") && s.length()>1) out.println(s);
else sentence = sentence + " " + s;
// NOTE(review): startsWith("") is always true - the end-of-sentence
// marker literal (presumably "</s>") was stripped by the HTML scrape.
finished = s.startsWith("");
}
if (finished) {
processSentences(morphoClassifier, out, sentence.trim());
sentence = "";
}
}
// Flush a trailing unterminated sentence for non-VERT input.
if (inputType != inputTypes.VERT && sentence.length()>0) { //FIXME, not DRY
processSentences(morphoClassifier, out, sentence.trim());
}
}
in.close();
out.close();
}
/**
 * Splits the text into sentences if needed, and forwards each to outputSentence.
 * @param cmm - the tagger, needed to retrieve tagger features if they are requested
 * @param out - a stream to output the data
 * @param text - raw sentence/paragraph text to tokenize, analyze and output
 */
public static void processSentences(
CMMClassifier cmm, PrintStream out, String text) {
if (inputType == inputTypes.PARAGRAPH) { // split in multiple sentences
// NOTE(review): the separator literals below are empty / malformed -
// presumably paragraph markers like "<p>"/"</p>" stripped by the HTML scrape.
if (outputSeparators) out.println("");
LinkedList> sentences = Splitting.tokenizeSentences(LVMorphologyReaderAndWriter.getAnalyzer(), text, sentencelengthcap);
for (LinkedList sentence : sentences)
outputSentence(cmm, out, LVMorphologyReaderAndWriter.analyzeSentence2(sentence) );
if (outputSeparators)
out.println("
");
else
out.println();
} else outputSentence(cmm, out, LVMorphologyReaderAndWriter.analyzeSentence(text) ); // just a single sentence for other types
}
/**
 * Runs the tagger on one tokenized sentence and prints it according to the
 * outputType set in this class.
 * @param cmm - the tagger, needed to retrieve tagger features if they are requested
 * @param out - a stream to output the data
 * @param sentence - actual tokens to be output
 */
public static void outputSentence(CMMClassifier cmm,
PrintStream out, List sentence) {
// NOTE(review): the empty separator literals below were presumably sentence
// markers like "<s>"/"</s>" stripped by the HTML scrape.
if (outputSeparators) out.println("");
// Disambiguation is skipped for formats that only need the surface form
// or the full undisambiguated option list.
if (outputType != outputTypes.lowercasedText && outputType != outputTypes.analyzerOptions) { //FIXME - a separate flag would be better
sentence = cmm.classify(sentence); // runs the actual morphotagging system
}
switch (outputType) {
case JSON:
out.println( output_JSON(sentence));
break;
case CONLL_X:
out.println( output_CONLL(sentence, cmm));
break;
case XML:
try {
output_XML(sentence, out);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
break;
case VISL_CG:
out.println( output_VISL(sentence));
break;
case lemmatizedText:
out.println( output_lemmatized(sentence));
break;
case lowercasedText:
out.println( output_lowercased(sentence));
break;
case analyzerOptions:
out.println( output_analyzer(sentence));
break;
default:
out.println( output_separated(sentence));
}
if (outputSeparators) out.println("");
out.flush();
}
/**
 * Formats a tagged sentence as a JSON array of objects with "Word", "Tag" and
 * "Lemma" keys. Tokens without a matching analysis get "-" as the tag and the
 * surface form as the lemma.
 * @param tokens tagged sentence tokens (sentence-boundary pseudo-tokens are skipped)
 * @return a single-line JSON array as a string
 */
private static String output_JSON(List<CoreLabel> tokens) {
	List<String> tokenJSON = new LinkedList<String>();
	for (CoreLabel word : tokens) {
		String token = word.getString(TextAnnotation.class);
		// Skip sentence-boundary pseudo-tokens. The scraped source had
		// contains("") here, which is always true; "<s>" restored - TODO
		// confirm against the upstream repository.
		if (token.contains("<s>")) continue;
		Word analysis = word.get(LVMorphologyAnalysis.class);
		Wordform maxwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false);
		if (maxwf != null) {
			// Moved inside the null check: the original dereferenced maxwf
			// for -stripped output before checking it for null.
			if (mini_tag) maxwf.removeNonlexicalAttributes();
			tokenJSON.add(String.format("{\"Word\":\"%s\",\"Tag\":\"%s\",\"Lemma\":\"%s\"}", JSONValue.escape(token), JSONValue.escape(maxwf.getTag()), JSONValue.escape(maxwf.getValue(AttributeNames.i_Lemma))));
		} else {
			tokenJSON.add(String.format("{\"Word\":\"%s\",\"Tag\":\"-\",\"Lemma\":\"%s\"}", JSONValue.escape(token), JSONValue.escape(token)));
		}
	}
	return formatJSON(tokenJSON).toString();
}
/**
 * Writes each analyzed token of the sentence as an XML element to the given
 * stream, one word per line, with the disambiguated tag attached as a "Tag"
 * attribute. Tokens with no matching analysis are skipped.
 * @param tokens tagged sentence tokens
 * @param straume destination stream (Latvian for "stream")
 * @throws IOException declared for Wordform.toXML
 */
private static void output_XML(List<CoreLabel> tokens, PrintStream straume) throws IOException {
	PrintWriter w = new PrintWriter(straume);
	for (CoreLabel word : tokens) {
		String token = word.getString(TextAnnotation.class);
		// Skip sentence-boundary pseudo-tokens ("<s>" restored; the scraped
		// source had an always-true contains("") - TODO confirm upstream).
		if (token.contains("<s>")) continue;
		Word analysis = word.get(LVMorphologyAnalysis.class);
		Wordform maxwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false);
		// Null guard added: the original dereferenced maxwf unconditionally,
		// although its own commented-out JSON-style code handled the null case.
		if (maxwf == null) continue;
		if (mini_tag) maxwf.removeNonlexicalAttributes();
		maxwf.addAttribute("Tag", maxwf.getTag());
		maxwf.toXML(w);
	}
	w.flush();
}
/**
 * Formats a tagged sentence in CONLL-X style: one token per line with
 * tab-separated columns (index, form, lemma, coarse tag, full tag, features),
 * followed by either the saved extra input columns or the syntax columns.
 * @param tokens tagged sentence tokens
 * @param cmm the tagger; used to regenerate training features when -features is set
 * @return the formatted sentence (lines terminated with eol)
 */
private static String output_CONLL(List tokens, CMMClassifier cmm){
StringBuilder s = new StringBuilder();
int counter = 1;
for (CoreLabel word : tokens) {
String token = word.getString(TextAnnotation.class);
// NOTE(review): contains("") is always true, so every token is skipped -
// the sentence-marker literal (presumably "<s>") was stripped by the HTML scrape.
if (token.contains("")) continue;
token = token.replace(' ', '_');
s.append(Integer.toString(counter));
s.append('\t');
s.append(token);
s.append('\t');
Word analysis = word.get(LVMorphologyAnalysis.class);
Wordform mainwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false);
if (mainwf != null) {
String lemma = mainwf.getValue(AttributeNames.i_Lemma);
lemma = lemma.replace(' ', '_');
String answer = word.getString(AnswerAnnotation.class);
if (answer.trim().isEmpty()) answer = "_"; // no empty tag
s.append(lemma);
s.append('\t');
s.append(answer);
s.append('\t');
s.append(mainwf.getTag());
s.append('\t');
// Feature attribute filters
if (mini_tag) mainwf.removeNonlexicalAttributes();
if (LETAfeatures) {
addLETAfeatures(mainwf);
// mainwf.removeAttribute(AttributeNames.i_SourceLemma); FIXME - for derived words this may be important, de-prefixed lemmas..
mainwf.removeTechnicalAttributes();
}
s.append(mainwf.pipeDelimitedEntries()); // append the word's feature attributes
if (features) { // all features that were used during training
Datum d = cmm.makeDatum(tokens, counter, cmm.featureFactory);
for (String feature : d.asFeatures()) {
s.append(feature.substring(0, feature.length()-2).replace(' ', '_')); // strip the trailing |C those features carry
s.append('|');
}
}
s.deleteCharAt(s.length()-1); // drop the final '|' separator, which is superfluous
s.append('\t');
} else {
// No analysis matched - emit placeholder columns.
s.append(token);
s.append("\t_\t_\t_\t");
}
if (saveColumns) {
s.append(word.getString(ExtraColumnAnnotation.class));
} else {
String syntax = word.getString(ParentAnnotation.class);
if (syntax != null) {
s.append(syntax);
}
else s.append("_\t_\t_\t_");
}
s.append(eol);
counter++;
}
return s.toString();
}
/**
 * Renders the sentence as space-separated lemmas. Lemmas are lowercased unless
 * -saveCase is set, in which case the surface token's initial capitalization is
 * copied onto the lemma. Tokens without a usable analysis are reported to
 * stderr and omitted.
 * @param tokens tagged sentence tokens
 * @return the lemmatized sentence on a single line
 */
private static String output_lemmatized(List<CoreLabel> tokens){
	StringBuilder s = new StringBuilder();
	for (CoreLabel word : tokens) {
		String token = word.getString(TextAnnotation.class);
		// Skip sentence-boundary pseudo-tokens. The scraped source had an
		// always-true contains(""), which skipped every token and made this
		// method return ""; "<s>" restored - TODO confirm upstream.
		if (token.contains("<s>")) continue;
		token = token.replace(' ', '_');
		Word analysis = word.get(LVMorphologyAnalysis.class);
		Wordform mainwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false);
		if (mainwf != null && !token.isEmpty()) {
			String lemma = mainwf.getValue(AttributeNames.i_Lemma);
			// Mirror the surface form's initial capitalization onto the lemma.
			if (saveCase && Character.isUpperCase(token.charAt(0))) lemma = lemma.substring(0,1).toUpperCase() + lemma.substring(1);
			if (!saveCase) lemma=lemma.toLowerCase();
			lemma = lemma.replace(' ', '_');
			s.append(lemma);
			s.append(' ');
		} else {
			System.err.println("Empty lemma");
		}
	}
	return s.toString().trim();
}
/**
 * Renders the sentence as space-separated lowercased surface tokens (spaces
 * inside multiword tokens become underscores).
 * @param tokens tokenized sentence
 * @return the lowercased sentence on a single line
 */
private static String output_lowercased(List<CoreLabel> tokens){
	StringBuilder s = new StringBuilder();
	for (CoreLabel word : tokens) {
		String token = word.getString(TextAnnotation.class);
		// Skip sentence-boundary pseudo-tokens. The scraped source had an
		// always-true contains(""), which skipped every token and made this
		// method return ""; "<s>" restored - TODO confirm upstream.
		if (token.contains("<s>")) continue;
		token = token.replace(' ', '_').toLowerCase();
		s.append(token);
		s.append(' ');
	}
	return s.toString().trim();
}
/**
 * For each token, prints the surface form followed by the full list of
 * undisambiguated morphological tag options from the analyzer, each option
 * preceded by token_separator.
 * NOTE(review): there is no separator between the last tag of one token and
 * the next token's surface form - they run together when token_separator is
 * not a newline. Preserved as-is; confirm intent upstream before changing.
 * @param tokens analyzed (but not necessarily disambiguated) sentence tokens
 * @return the formatted option list
 */
private static String output_analyzer(List<CoreLabel> tokens){
	StringBuilder s = new StringBuilder();
	for (CoreLabel word : tokens) {
		String token = word.getString(TextAnnotation.class);
		// Skip sentence-boundary pseudo-tokens. The scraped source had an
		// always-true contains(""), which skipped every token; "<s>"
		// restored - TODO confirm upstream.
		if (token.contains("<s>")) continue;
		token = token.replace(' ', '_');
		s.append(token);
		Word analysis = word.get(LVMorphologyAnalysis.class);
		for (Wordform wf : analysis.wordforms) {
			s.append(token_separator);
			s.append(wf.getTag());
		}
	}
	return s.toString().trim();
}
/**
 * Adds a "LETA_lemma" attribute used as an extra feature for semantic frame
 * analysis: the lemma is bucketed into categories (number pattern, surname,
 * vocation, relationship, party, month, common lemma) with rare lemmas
 * collapsed to "_rare_" to reduce sparsity. The order of the dictionary
 * checks below is significant - the first match wins.
 * @param wf the wordform to annotate in place
 */
private static void addLETAfeatures(Wordform wf) {
String lemma = wf.getValue(AttributeNames.i_Lemma);
if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.i_Number)) {
String numbercode = lemma.replaceAll("\\d", "0"); // we assume numbers with the same digit count are interchangeable
wf.addAttribute("LETA_lemma", numbercode);
} else if (wf.isMatchingStrong(AttributeNames.i_CapitalLetters, AttributeNames.v_FirstUpper) && Dictionary.dict("surnames").contains(lemma))
wf.addAttribute("LETA_lemma", "_surname_");
else if (Dictionary.dict("vocations").contains(lemma))
wf.addAttribute("LETA_lemma", "_vocation_");
else if (Dictionary.dict("relations").contains(lemma))
wf.addAttribute("LETA_lemma", "_relationship_");
else if (Dictionary.dict("partijas").contains(lemma))
wf.addAttribute("LETA_lemma", "_party_"); // TODO - check how case-sensitivity plays out across the whole pipeline, it matters here
else if (Dictionary.dict("months").contains(lemma)) // TODO - should verify that the person names Marts and Julijs (month homonyms) don't get swallowed here when capitalized mid-sentence
wf.addAttribute("LETA_lemma", "_month_");
else if (Dictionary.dict("common_lemmas").contains(lemma))
wf.addAttribute("LETA_lemma", lemma);
else wf.addAttribute("LETA_lemma", "_rare_");
}
// VISL CG format, as described in http://beta.visl.sdu.dk/cg3/chunked/streamformats.html#stream-vislcg
/**
 * Formats the sentence for the VISL CG3 constraint grammar tool: each token
 * becomes a "<word>" line followed by one indented reading line per analysis
 * variant (the "cohort"), listing lemma, tags and attribute-value pairs.
 * @param tokens tagged sentence tokens
 * @return the VISL-CG formatted sentence
 */
private static String output_VISL(List<CoreLabel> tokens) {
	StringBuilder s = new StringBuilder();
	for (CoreLabel word : tokens) {
		String token = word.getString(TextAnnotation.class);
		// Skip sentence-boundary pseudo-tokens ("<s>" restored; scraped source
		// had an always-true contains("") - TODO confirm upstream).
		if (token.contains("<s>")) continue;
		// VISL (seems to) require quotes escaped in their format. The original
		// called replaceAll() and discarded the result (Strings are immutable),
		// so no escaping ever happened - and replaceAll's replacement-string
		// escape rules would have made it a no-op anyway. replace() treats
		// both arguments literally and yields the intended \" sequence.
		token = token.replace("\"", "\\\"");
		s.append(String.format("\"<%s>\"\n", token)); // <"They"> from the example
		Word analysis = word.get(LVMorphologyAnalysis.class);
		// (An unused getMatchingWordform() call was removed here.)
		for (Wordform wf : analysis.wordforms) { // output the "cohort" in VISL-CG terms
			String lemma = wf.getValue(AttributeNames.i_Lemma);
			// Ad-hoc ... removing 'bookkeeping' attributes that seem useless for CG
			wf.removeTechnicalAttributes();
			wf.removeAttribute(AttributeNames.i_Lemma);
			wf.removeAttribute(AttributeNames.i_SourceLemma);
			lemma = lemma.replace("\"", "\\\""); // was also a discarded replaceAll() - see note above
			s.append(String.format("\t\"%s\" ", lemma));
			s.append(wf.getTag());
			s.append(" ");
			AttributeValues minimum = new AttributeValues(wf);
			minimum.removeNonlexicalAttributes();
			s.append(minimum.getTag());
			s.append(" ");
			for (Entry<String, String> entry : wf.entrySet()) { // every attribute-value pair
				String key = entry.getKey();
				String value = entry.getValue();
				// For attributes with distinctive value names (like parts of speech) skip the attribute name for readability in CG
				if ((!key.equalsIgnoreCase(AttributeNames.i_PartOfSpeech) &&
					!key.equalsIgnoreCase(AttributeNames.i_Case) &&
					!key.equalsIgnoreCase(AttributeNames.i_Number) &&
					!key.equalsIgnoreCase(AttributeNames.i_Gender) &&
					!key.equalsIgnoreCase(AttributeNames.i_NounType) &&
					!key.equalsIgnoreCase(AttributeNames.i_Izteiksme) &&
					!key.equalsIgnoreCase(AttributeNames.i_VerbType) &&
					!key.equalsIgnoreCase(AttributeNames.i_Laiks) &&
					!key.equalsIgnoreCase(AttributeNames.i_Transitivity) &&
					!key.equalsIgnoreCase(AttributeNames.i_Declension) &&
					!key.equalsIgnoreCase(AttributeNames.i_Definiteness) &&
					!key.equalsIgnoreCase(AttributeNames.i_Lokaamiiba) &&
					!key.equalsIgnoreCase(AttributeNames.i_AdjectiveType) &&
					!key.equalsIgnoreCase(AttributeNames.i_SaikljaTips) &&
					!key.equalsIgnoreCase(AttributeNames.i_Uzbuuve) &&
					!key.equalsIgnoreCase(AttributeNames.i_PieturziimesTips) &&
					!key.equalsIgnoreCase(AttributeNames.i_Voice) &&
					!key.equalsIgnoreCase(AttributeNames.i_VvTips)
				) || (value.equalsIgnoreCase(AttributeNames.v_NA) &&
					!key.equalsIgnoreCase(AttributeNames.i_Anafora) &&
					!key.equalsIgnoreCase(AttributeNames.i_Laiks)
				)) {
					s.append(key.replace(' ', '_'));
					s.append('=');
				}
				s.append(value.replace(' ', '_'));
				s.append(' ');
			}
			s.append(eol);
		}
	}
	s.append("\"<>\""); // NOTE(review): looks like a stripped end-of-sentence marker (e.g. "</s>" inside the quotes) - confirm upstream
	return s.toString();
}
/**
 * Default separated output (TAB/VERT/MOSES): for each token emits
 * token + field_separator + tag + field_separator + lemma, with tokens joined
 * by token_separator. With -whitespace-marker, a marker entry is emitted
 * before tokens that had no whitespace before them.
 * @param tokens tagged sentence tokens
 * @return the formatted sentence
 */
private static String output_separated(List<CoreLabel> tokens){
	StringBuilder s = new StringBuilder();
	for (CoreLabel word : tokens) {
		String token = word.getString(TextAnnotation.class);
		// Skip sentence-boundary pseudo-tokens ("<s>" restored; scraped source
		// had an always-true contains("") - TODO confirm upstream).
		if (token.contains("<s>")) continue;
		Word analysis = word.get(LVMorphologyAnalysis.class);
		Wordform mainwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false);
		if (s.length()>0) s.append(token_separator);
		// Null guard added: the original dereferenced mainwf here although the
		// null case is explicitly handled further down.
		if (whitespaceMarker && mainwf != null && mainwf.isMatchingStrong(AttributeNames.i_WhitespaceBefore, "")) {
			s.append(" "); // NOTE(review): presumably a stripped glue marker such as "<g />" - confirm upstream
			s.append(token_separator);
		}
		if (outputType == outputTypes.MOSES) token = token.replace(' ', '_');
		s.append(token);
		s.append(field_separator);
		if (mainwf != null) {
			if (mini_tag) mainwf.removeNonlexicalAttributes();
			s.append(mainwf.getTag());
			s.append(field_separator);
			String lemma = mainwf.getValue(AttributeNames.i_Lemma);
			if (outputType == outputTypes.MOSES) lemma = lemma.replace(' ', '_');
			s.append(lemma);
		} else s.append(field_separator); // no analysis: tag and lemma fields stay empty
	}
	return s.toString();
}
/**
 * Joins the given items into a JSON-style array literal: "[a, b, c]".
 * Elements are appended verbatim (callers pre-escape them), separated by
 * ", " and wrapped in square brackets.
 * @param tags pre-formatted element strings
 * @return the bracketed, comma-separated listing
 */
private static StringBuilder formatJSON(Collection tags) {
	StringBuilder json = new StringBuilder("[");
	String separator = "";
	for (Object tag : tags) {
		json.append(separator);
		json.append(tag);
		separator = ", ";
	}
	json.append("]");
	return json;
}
public static List> readCONLL(BufferedReader in) throws IOException {
String s;
List sentence = new LinkedList();
List> result = new LinkedList>();
CoreLabel stag = new CoreLabel();
stag.set(TextAnnotation.class, "");
sentence.add(stag);
while ((s = in.readLine()) != null) {
if (s.trim().length() > 0) {
String[] fields = s.split("\t");
String token = fields[1];
if (!token.equalsIgnoreCase("_")) token = token.replace('_', ' ');
String extraColumns = "";
if (saveColumns) {
for (int field_i = 6; field_i < fields.length; field_i++) extraColumns += fields[field_i] + "\t";
extraColumns.trim();
}
String syntax = "";
if (fields.length >= 10) syntax = fields[6] + "\t" + fields[7] + "\t" + fields[8] + "\t" + fields[9];
CoreLabel word = new CoreLabel();
word.set(TextAnnotation.class, token);
word.set(ParentAnnotation.class, syntax);
word.set(ExtraColumnAnnotation.class, extraColumns);
sentence.add(word);
} else {
stag = new CoreLabel();
stag.set(TextAnnotation.class, "");
sentence.add(stag);
result.add(LVMorphologyReaderAndWriter.analyzeLabels(sentence));
sentence = new LinkedList();
stag = new CoreLabel();
stag.set(TextAnnotation.class, "");
sentence.add(stag);
}
}
if (sentence.size() > 0) {
stag = new CoreLabel();
stag.set(TextAnnotation.class, "");
sentence.add(stag);
result.add(LVMorphologyReaderAndWriter.analyzeLabels(sentence));
}
return result;
}
}