// lv.lumii.morphotagger.MorphoPipe (source as listed on a Maven / Gradle / Ivy artifact browser)
/*******************************************************************************
* Copyright 2012,2013,2014 Institute of Mathematics and Computer Science, University of Latvia
* Author: Pēteris Paikens
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.  If not, see <http://www.gnu.org/licenses/>.
*******************************************************************************/
package lv.lumii.morphotagger;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import lv.semti.morphology.analyzer.Splitting;
import lv.semti.morphology.analyzer.Word;
import lv.semti.morphology.analyzer.Wordform;
import lv.semti.morphology.attributes.AttributeNames;
import lv.semti.morphology.attributes.AttributeValues;
import org.json.simple.JSONValue;
import edu.stanford.nlp.ie.ner.CMMClassifier;
import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.ExtraColumnAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LVMorphologyAnalysis;
import edu.stanford.nlp.ling.CoreAnnotations.ParentAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Datum;
import edu.stanford.nlp.sequences.LVMorphologyReaderAndWriter;
// Copied/pasted/mangled from transliteration webservices java project
// Command-line pipeline that tokenizes Latvian text, runs the CMM morphotagger
// and serializes the analyses in one of several output formats.
// NOTE(review): this copy of the source has passed through an HTML stripper -
// generic type parameters and angle-bracketed markup inside string literals
// (e.g. "<s>" sentence markers) have been removed throughout the file.
public class MorphoPipe {
// Supported interpretations of stdin.
private enum inputTypes {SENTENCE, PARAGRAPH, VERT, CONLL, JSON};
// Supported output serializations.
private enum outputTypes {JSON, TAB, VERT, MOSES, CONLL_X, XML, VISL_CG, lemmatizedText, lowercasedText, analyzerOptions};
private static String eol = System.getProperty("line.separator");
private static String field_separator = "\t";
private static String token_separator = eol;
private static boolean mini_tag = false; // strip lexical/nonessential parts of the tag
private static boolean features = false; // include training features in CONLL output
private static boolean LETAfeatures = false; // extra semantic-frame features in CONLL output
private static inputTypes inputType = inputTypes.PARAGRAPH;
private static outputTypes outputType = outputTypes.CONLL_X;
//private static int sentencelengthcap = Splitting.DEFAULT_SENTENCE_LENGTH_CAP;
private static int sentencelengthcap = 100; // max tokens per sentence when splitting paragraphs
private static boolean saveColumns = false; // carry extra CONLL input columns through to output
private static boolean keepTags = false; // echo lines starting with '<' (xml-style metadata)
private static boolean saveCase = false; // for lemmatized text output format
private static boolean outputSeparators = false; // for sentences, for paragraphs
private static boolean whitespaceMarker = false; // mark token pairs that had no whitespace between them
private static boolean stopOnEmpty = true; // quit on empty line
private static String morphoClassifierLocation = "models/lv-morpho-model.ser.gz"; //FIXME - make it configurable
/**
 * Entry point: parses command-line flags, loads the morphotagging model and
 * streams stdin through the tagger to stdout (both forced to UTF-8).
 * NOTE(review): the option-parsing loop below is visibly truncated - the body
 * of "for (int i=0; i..." (presumably "i < args.length" plus a chain of flag
 * checks setting the static config fields, ending in a help printout) was lost
 * to HTML stripping, leaving usage text fused onto the for-statement. Recover
 * it from the upstream repository before attempting to compile.
 */
public static void main(String[] args) throws Exception {
for (int i=0; i. Any XML-style tags are echoed as-is. \n\t\tNB! sentences are retokenized, the number of tokens may be different.");
System.out.println("\t-conll-in : CONLL shared task data format - one line per token, with tab-delimited columns, sentences separated by blank lines.");
System.out.println("\t-json-in : one line per sentence, each line contains a single json array of strings-tokens.");
System.out.println("\nOutput formats");
System.out.println("\tDefault : JSON. Each sentence is returned as a list of dicts, each dict contains elements 'Word', 'Tag' and 'Lemma'.");
System.out.println("\t-tab : one response line for each query line; tab-separated lists of word, tag and lemma.");
System.out.println("\t-vert : one response line for each token; tab-separated lists of word, tag and lemma.");
System.out.println("\t-moses : one response line for each token; pipe-separated lists of word, tag and lemma.");
System.out.println("\t-conll-x : CONLL-X shared task data format - one line per token, with tab-delimited columns, sentences separated by blank lines.");
System.out.println("\t-xml : one xml word per line");
System.out.println("\t-visl-cg : output format for VISL constraint grammar tool");
System.out.println("\t-lemmatized-text : output lowercase lemmatized text, each sentence in new row, tokens seperated by single space");
System.out.println("\t-lowercased-text : output lowercased text, each sentence in new row, tokens seperated by single space");
System.out.println("\t-analyzer : one response line for each token; word followed by a tab-separated list of undisambiguated morphological tag options");
System.out.println("\nOther options:");
System.out.println("\t-stripped : lexical/nonessential parts of the tag are replaced with '-' to reduce sparsity.");
System.out.println("\t-features : in conll output, include the features that were used for training/tagging.");
System.out.println("\t-leta : in conll output, include extra features used for semantic frame analysis.");
System.out.println("\t-saveColumns : save extra columns from conll input.");
System.out.println("\t-unix-line-endings : use \\n line endings for output even on windows systems");
System.out.println("\t-keep-tags : preserve lines that start with '<' to enable xml-style metadata");
System.out.println("\t-output-separators : put sentence markup and paragraph markup");
System.out.println("\t-whitespace-marker : put tags where the tokens did not have whitespace between them");
System.out.println("\t-allow-empty-lines : do not quit on blank lines input (as per default)");
System.out.flush();
System.exit(0);
}
}
// Load the serialized CMM morphotagger model and wire up UTF-8 console streams.
CMMClassifier morphoClassifier = CMMClassifier.getClassifier(morphoClassifierLocation);
PrintStream out = new PrintStream(System.out, true, "UTF8");
BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF8"));
switch(inputType) {
case CONLL:
// CONLL input: sentences are pre-tokenized, analyze each one directly.
for (List sentence : readCONLL(in)) {
outputSentence(morphoClassifier, out, sentence);
}
break;
default:
// Line-oriented input: accumulate lines into sentences, then process.
String s;
String sentence = "";
while ((s = in.readLine()) != null && (s.length() != 0 || !stopOnEmpty)) {
// Echo xml-style metadata lines untouched when -keep-tags is on.
if (s.startsWith("<") && s.length()>1 && keepTags) {
if (outputType != outputTypes.lemmatizedText && outputType != outputTypes.lowercasedText) out.println(s);
continue;
}
if (s.length() == 0) continue;
boolean finished = true; // is sentence finished and ready to analyze
if (inputType != inputTypes.VERT) {
sentence = s;
} else {
// VERT input: one token per line; tags are echoed, tokens accumulated.
if (s.startsWith("<") && s.length()>1) out.println(s);
else sentence = sentence + " " + s;
// NOTE(review): startsWith("") is always true - the end-of-sentence
// marker literal (presumably "</s>") was stripped by the HTML scrape.
finished = s.startsWith("");
}
if (finished) {
processSentences(morphoClassifier, out, sentence.trim());
sentence = "";
}
}
// Flush a trailing unterminated sentence for non-VERT input.
if (inputType != inputTypes.VERT && sentence.length()>0) { //FIXME, not DRY
processSentences(morphoClassifier, out, sentence.trim());
}
}
in.close();
out.close();
}
/**
 * Splits the text into sentences if needed, and forwards each to outputSentence.
 * @param cmm - the tagger, needed to retrieve tagger features if they are requested
 * @param out - a stream to output the data
 * @param text - raw sentence/paragraph text to tokenize, analyze and output
 */
public static void processSentences(
CMMClassifier cmm, PrintStream out, String text) {
if (inputType == inputTypes.PARAGRAPH) { // split in multiple sentences
// NOTE(review): the separator literals below are empty / malformed -
// presumably paragraph markers like "<p>"/"</p>" stripped by the HTML scrape.
if (outputSeparators) out.println("");
LinkedList> sentences = Splitting.tokenizeSentences(LVMorphologyReaderAndWriter.getAnalyzer(), text, sentencelengthcap);
for (LinkedList sentence : sentences)
outputSentence(cmm, out, LVMorphologyReaderAndWriter.analyzeSentence2(sentence) );
if (outputSeparators)
out.println("
");
else
out.println();
} else outputSentence(cmm, out, LVMorphologyReaderAndWriter.analyzeSentence(text) ); // just a single sentence for other types
}
/**
 * Runs the tagger on one tokenized sentence and prints it according to the
 * outputType set in this class.
 * @param cmm - the tagger, needed to retrieve tagger features if they are requested
 * @param out - a stream to output the data
 * @param sentence - actual tokens to be output
 */
public static void outputSentence(CMMClassifier cmm,
PrintStream out, List sentence) {
// NOTE(review): the empty separator literals below were presumably sentence
// markers like "<s>"/"</s>" stripped by the HTML scrape.
if (outputSeparators) out.println("");
// Disambiguation is skipped for formats that only need the surface form
// or the full undisambiguated option list.
if (outputType != outputTypes.lowercasedText && outputType != outputTypes.analyzerOptions) { //FIXME - a separate flag would be better
sentence = cmm.classify(sentence); // runs the actual morphotagging system
}
switch (outputType) {
case JSON:
out.println( output_JSON(sentence));
break;
case CONLL_X:
out.println( output_CONLL(sentence, cmm));
break;
case XML:
try {
output_XML(sentence, out);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
break;
case VISL_CG:
out.println( output_VISL(sentence));
break;
case lemmatizedText:
out.println( output_lemmatized(sentence));
break;
case lowercasedText:
out.println( output_lowercased(sentence));
break;
case analyzerOptions:
out.println( output_analyzer(sentence));
break;
default:
out.println( output_separated(sentence));
}
if (outputSeparators) out.println("");
out.flush();
}
/**
 * Formats a tagged sentence as a JSON array of objects with "Word", "Tag" and
 * "Lemma" keys. Tokens without a matching analysis get "-" as the tag and the
 * surface form as the lemma.
 * @param tokens tagged sentence tokens (sentence-boundary pseudo-tokens are skipped)
 * @return a single-line JSON array as a string
 */
private static String output_JSON(List<CoreLabel> tokens) {
	List<String> tokenJSON = new LinkedList<String>();
	for (CoreLabel word : tokens) {
		String token = word.getString(TextAnnotation.class);
		// Skip sentence-boundary pseudo-tokens. The scraped source had
		// contains("") here, which is always true; "<s>" restored - TODO
		// confirm against the upstream repository.
		if (token.contains("<s>")) continue;
		Word analysis = word.get(LVMorphologyAnalysis.class);
		Wordform maxwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false);
		if (maxwf != null) {
			// Moved inside the null check: the original dereferenced maxwf
			// for -stripped output before checking it for null.
			if (mini_tag) maxwf.removeNonlexicalAttributes();
			tokenJSON.add(String.format("{\"Word\":\"%s\",\"Tag\":\"%s\",\"Lemma\":\"%s\"}", JSONValue.escape(token), JSONValue.escape(maxwf.getTag()), JSONValue.escape(maxwf.getValue(AttributeNames.i_Lemma))));
		} else {
			tokenJSON.add(String.format("{\"Word\":\"%s\",\"Tag\":\"-\",\"Lemma\":\"%s\"}", JSONValue.escape(token), JSONValue.escape(token)));
		}
	}
	return formatJSON(tokenJSON).toString();
}
/**
 * Writes each analyzed token of the sentence as an XML element to the given
 * stream, one word per line, with the disambiguated tag attached as a "Tag"
 * attribute. Tokens with no matching analysis are skipped.
 * @param tokens tagged sentence tokens
 * @param straume destination stream (Latvian for "stream")
 * @throws IOException declared for Wordform.toXML
 */
private static void output_XML(List<CoreLabel> tokens, PrintStream straume) throws IOException {
	PrintWriter w = new PrintWriter(straume);
	for (CoreLabel word : tokens) {
		String token = word.getString(TextAnnotation.class);
		// Skip sentence-boundary pseudo-tokens ("<s>" restored; the scraped
		// source had an always-true contains("") - TODO confirm upstream).
		if (token.contains("<s>")) continue;
		Word analysis = word.get(LVMorphologyAnalysis.class);
		Wordform maxwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false);
		// Null guard added: the original dereferenced maxwf unconditionally,
		// although its own commented-out JSON-style code handled the null case.
		if (maxwf == null) continue;
		if (mini_tag) maxwf.removeNonlexicalAttributes();
		maxwf.addAttribute("Tag", maxwf.getTag());
		maxwf.toXML(w);
	}
	w.flush();
}
/**
 * Formats a tagged sentence in CONLL-X style: one token per line with
 * tab-separated columns (index, form, lemma, coarse tag, full tag, features),
 * followed by either the saved extra input columns or the syntax columns.
 * @param tokens tagged sentence tokens
 * @param cmm the tagger; used to regenerate training features when -features is set
 * @return the formatted sentence (lines terminated with eol)
 */
private static String output_CONLL(List tokens, CMMClassifier cmm){
StringBuilder s = new StringBuilder();
int counter = 1;
for (CoreLabel word : tokens) {
String token = word.getString(TextAnnotation.class);
// NOTE(review): contains("") is always true, so every token is skipped -
// the sentence-marker literal (presumably "<s>") was stripped by the HTML scrape.
if (token.contains("")) continue;
token = token.replace(' ', '_');
s.append(Integer.toString(counter));
s.append('\t');
s.append(token);
s.append('\t');
Word analysis = word.get(LVMorphologyAnalysis.class);
Wordform mainwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false);
if (mainwf != null) {
String lemma = mainwf.getValue(AttributeNames.i_Lemma);
lemma = lemma.replace(' ', '_');
String answer = word.getString(AnswerAnnotation.class);
if (answer.trim().isEmpty()) answer = "_"; // no empty tag
s.append(lemma);
s.append('\t');
s.append(answer);
s.append('\t');
s.append(mainwf.getTag());
s.append('\t');
// Feature attribute filters
if (mini_tag) mainwf.removeNonlexicalAttributes();
if (LETAfeatures) {
addLETAfeatures(mainwf);
// mainwf.removeAttribute(AttributeNames.i_SourceLemma); FIXME - for derived words this may be important, de-prefixed lemmas..
mainwf.removeTechnicalAttributes();
}
s.append(mainwf.pipeDelimitedEntries()); // append the word's feature attributes
if (features) { // all features that were used during training
Datum d = cmm.makeDatum(tokens, counter, cmm.featureFactory);
for (String feature : d.asFeatures()) {
s.append(feature.substring(0, feature.length()-2).replace(' ', '_')); // strip the trailing |C those features carry
s.append('|');
}
}
s.deleteCharAt(s.length()-1); // drop the final '|' separator, which is superfluous
s.append('\t');
} else {
// No analysis matched - emit placeholder columns.
s.append(token);
s.append("\t_\t_\t_\t");
}
if (saveColumns) {
s.append(word.getString(ExtraColumnAnnotation.class));
} else {
String syntax = word.getString(ParentAnnotation.class);
if (syntax != null) {
s.append(syntax);
}
else s.append("_\t_\t_\t_");
}
s.append(eol);
counter++;
}
return s.toString();
}
/**
 * Renders the sentence as space-separated lemmas. Lemmas are lowercased unless
 * -saveCase is set, in which case the surface token's initial capitalization is
 * copied onto the lemma. Tokens without a usable analysis are reported to
 * stderr and omitted.
 * @param tokens tagged sentence tokens
 * @return the lemmatized sentence on a single line
 */
private static String output_lemmatized(List<CoreLabel> tokens){
	StringBuilder s = new StringBuilder();
	for (CoreLabel word : tokens) {
		String token = word.getString(TextAnnotation.class);
		// Skip sentence-boundary pseudo-tokens. The scraped source had an
		// always-true contains(""), which skipped every token and made this
		// method return ""; "<s>" restored - TODO confirm upstream.
		if (token.contains("<s>")) continue;
		token = token.replace(' ', '_');
		Word analysis = word.get(LVMorphologyAnalysis.class);
		Wordform mainwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false);
		if (mainwf != null && !token.isEmpty()) {
			String lemma = mainwf.getValue(AttributeNames.i_Lemma);
			// Mirror the surface form's initial capitalization onto the lemma.
			if (saveCase && Character.isUpperCase(token.charAt(0))) lemma = lemma.substring(0,1).toUpperCase() + lemma.substring(1);
			if (!saveCase) lemma=lemma.toLowerCase();
			lemma = lemma.replace(' ', '_');
			s.append(lemma);
			s.append(' ');
		} else {
			System.err.println("Empty lemma");
		}
	}
	return s.toString().trim();
}
/**
 * Renders the sentence as space-separated lowercased surface tokens (spaces
 * inside multiword tokens become underscores).
 * @param tokens tokenized sentence
 * @return the lowercased sentence on a single line
 */
private static String output_lowercased(List<CoreLabel> tokens){
	StringBuilder s = new StringBuilder();
	for (CoreLabel word : tokens) {
		String token = word.getString(TextAnnotation.class);
		// Skip sentence-boundary pseudo-tokens. The scraped source had an
		// always-true contains(""), which skipped every token and made this
		// method return ""; "<s>" restored - TODO confirm upstream.
		if (token.contains("<s>")) continue;
		token = token.replace(' ', '_').toLowerCase();
		s.append(token);
		s.append(' ');
	}
	return s.toString().trim();
}
/**
 * For each token, prints the surface form followed by the full list of
 * undisambiguated morphological tag options from the analyzer, each option
 * preceded by token_separator.
 * NOTE(review): there is no separator between the last tag of one token and
 * the next token's surface form - they run together when token_separator is
 * not a newline. Preserved as-is; confirm intent upstream before changing.
 * @param tokens analyzed (but not necessarily disambiguated) sentence tokens
 * @return the formatted option list
 */
private static String output_analyzer(List<CoreLabel> tokens){
	StringBuilder s = new StringBuilder();
	for (CoreLabel word : tokens) {
		String token = word.getString(TextAnnotation.class);
		// Skip sentence-boundary pseudo-tokens. The scraped source had an
		// always-true contains(""), which skipped every token; "<s>"
		// restored - TODO confirm upstream.
		if (token.contains("<s>")) continue;
		token = token.replace(' ', '_');
		s.append(token);
		Word analysis = word.get(LVMorphologyAnalysis.class);
		for (Wordform wf : analysis.wordforms) {
			s.append(token_separator);
			s.append(wf.getTag());
		}
	}
	return s.toString().trim();
}
/**
 * Adds a "LETA_lemma" attribute used as an extra feature for semantic frame
 * analysis: the lemma is bucketed into categories (number pattern, surname,
 * vocation, relationship, party, month, common lemma) with rare lemmas
 * collapsed to "_rare_" to reduce sparsity. The order of the dictionary
 * checks below is significant - the first match wins.
 * @param wf the wordform to annotate in place
 */
private static void addLETAfeatures(Wordform wf) {
String lemma = wf.getValue(AttributeNames.i_Lemma);
if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.i_Number)) {
String numbercode = lemma.replaceAll("\\d", "0"); // we assume numbers with the same digit count are interchangeable
wf.addAttribute("LETA_lemma", numbercode);
} else if (wf.isMatchingStrong(AttributeNames.i_CapitalLetters, AttributeNames.v_FirstUpper) && Dictionary.dict("surnames").contains(lemma))
wf.addAttribute("LETA_lemma", "_surname_");
else if (Dictionary.dict("vocations").contains(lemma))
wf.addAttribute("LETA_lemma", "_vocation_");
else if (Dictionary.dict("relations").contains(lemma))
wf.addAttribute("LETA_lemma", "_relationship_");
else if (Dictionary.dict("partijas").contains(lemma))
wf.addAttribute("LETA_lemma", "_party_"); // TODO - check how case-sensitivity plays out across the whole pipeline, it matters here
else if (Dictionary.dict("months").contains(lemma)) // TODO - should verify that the person names Marts and Julijs (month homonyms) don't get swallowed here when capitalized mid-sentence
wf.addAttribute("LETA_lemma", "_month_");
else if (Dictionary.dict("common_lemmas").contains(lemma))
wf.addAttribute("LETA_lemma", lemma);
else wf.addAttribute("LETA_lemma", "_rare_");
}
// VISL CG format, as described in http://beta.visl.sdu.dk/cg3/chunked/streamformats.html#stream-vislcg
/**
 * Formats the sentence for the VISL CG3 constraint grammar tool: each token
 * becomes a "<word>" line followed by one indented reading line per analysis
 * variant (the "cohort"), listing lemma, tags and attribute-value pairs.
 * @param tokens tagged sentence tokens
 * @return the VISL-CG formatted sentence
 */
private static String output_VISL(List<CoreLabel> tokens) {
	StringBuilder s = new StringBuilder();
	for (CoreLabel word : tokens) {
		String token = word.getString(TextAnnotation.class);
		// Skip sentence-boundary pseudo-tokens ("<s>" restored; scraped source
		// had an always-true contains("") - TODO confirm upstream).
		if (token.contains("<s>")) continue;
		// VISL (seems to) require quotes escaped in their format. The original
		// called replaceAll() and discarded the result (Strings are immutable),
		// so no escaping ever happened - and replaceAll's replacement-string
		// escape rules would have made it a no-op anyway. replace() treats
		// both arguments literally and yields the intended \" sequence.
		token = token.replace("\"", "\\\"");
		s.append(String.format("\"<%s>\"\n", token)); // <"They"> from the example
		Word analysis = word.get(LVMorphologyAnalysis.class);
		// (An unused getMatchingWordform() call was removed here.)
		for (Wordform wf : analysis.wordforms) { // output the "cohort" in VISL-CG terms
			String lemma = wf.getValue(AttributeNames.i_Lemma);
			// Ad-hoc ... removing 'bookkeeping' attributes that seem useless for CG
			wf.removeTechnicalAttributes();
			wf.removeAttribute(AttributeNames.i_Lemma);
			wf.removeAttribute(AttributeNames.i_SourceLemma);
			lemma = lemma.replace("\"", "\\\""); // was also a discarded replaceAll() - see note above
			s.append(String.format("\t\"%s\" ", lemma));
			s.append(wf.getTag());
			s.append(" ");
			AttributeValues minimum = new AttributeValues(wf);
			minimum.removeNonlexicalAttributes();
			s.append(minimum.getTag());
			s.append(" ");
			for (Entry<String, String> entry : wf.entrySet()) { // every attribute-value pair
				String key = entry.getKey();
				String value = entry.getValue();
				// For attributes with distinctive value names (like parts of speech) skip the attribute name for readability in CG
				if ((!key.equalsIgnoreCase(AttributeNames.i_PartOfSpeech) &&
					!key.equalsIgnoreCase(AttributeNames.i_Case) &&
					!key.equalsIgnoreCase(AttributeNames.i_Number) &&
					!key.equalsIgnoreCase(AttributeNames.i_Gender) &&
					!key.equalsIgnoreCase(AttributeNames.i_NounType) &&
					!key.equalsIgnoreCase(AttributeNames.i_Izteiksme) &&
					!key.equalsIgnoreCase(AttributeNames.i_VerbType) &&
					!key.equalsIgnoreCase(AttributeNames.i_Laiks) &&
					!key.equalsIgnoreCase(AttributeNames.i_Transitivity) &&
					!key.equalsIgnoreCase(AttributeNames.i_Declension) &&
					!key.equalsIgnoreCase(AttributeNames.i_Definiteness) &&
					!key.equalsIgnoreCase(AttributeNames.i_Lokaamiiba) &&
					!key.equalsIgnoreCase(AttributeNames.i_AdjectiveType) &&
					!key.equalsIgnoreCase(AttributeNames.i_SaikljaTips) &&
					!key.equalsIgnoreCase(AttributeNames.i_Uzbuuve) &&
					!key.equalsIgnoreCase(AttributeNames.i_PieturziimesTips) &&
					!key.equalsIgnoreCase(AttributeNames.i_Voice) &&
					!key.equalsIgnoreCase(AttributeNames.i_VvTips)
				) || (value.equalsIgnoreCase(AttributeNames.v_NA) &&
					!key.equalsIgnoreCase(AttributeNames.i_Anafora) &&
					!key.equalsIgnoreCase(AttributeNames.i_Laiks)
				)) {
					s.append(key.replace(' ', '_'));
					s.append('=');
				}
				s.append(value.replace(' ', '_'));
				s.append(' ');
			}
			s.append(eol);
		}
	}
	s.append("\"<>\""); // NOTE(review): looks like a stripped end-of-sentence marker (e.g. "</s>" inside the quotes) - confirm upstream
	return s.toString();
}
/**
 * Default separated output (TAB/VERT/MOSES): for each token emits
 * token + field_separator + tag + field_separator + lemma, with tokens joined
 * by token_separator. With -whitespace-marker, a marker entry is emitted
 * before tokens that had no whitespace before them.
 * @param tokens tagged sentence tokens
 * @return the formatted sentence
 */
private static String output_separated(List<CoreLabel> tokens){
	StringBuilder s = new StringBuilder();
	for (CoreLabel word : tokens) {
		String token = word.getString(TextAnnotation.class);
		// Skip sentence-boundary pseudo-tokens ("<s>" restored; scraped source
		// had an always-true contains("") - TODO confirm upstream).
		if (token.contains("<s>")) continue;
		Word analysis = word.get(LVMorphologyAnalysis.class);
		Wordform mainwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false);
		if (s.length()>0) s.append(token_separator);
		// Null guard added: the original dereferenced mainwf here although the
		// null case is explicitly handled further down.
		if (whitespaceMarker && mainwf != null && mainwf.isMatchingStrong(AttributeNames.i_WhitespaceBefore, "")) {
			s.append(" "); // NOTE(review): presumably a stripped glue marker such as "<g />" - confirm upstream
			s.append(token_separator);
		}
		if (outputType == outputTypes.MOSES) token = token.replace(' ', '_');
		s.append(token);
		s.append(field_separator);
		if (mainwf != null) {
			if (mini_tag) mainwf.removeNonlexicalAttributes();
			s.append(mainwf.getTag());
			s.append(field_separator);
			String lemma = mainwf.getValue(AttributeNames.i_Lemma);
			if (outputType == outputTypes.MOSES) lemma = lemma.replace(' ', '_');
			s.append(lemma);
		} else s.append(field_separator); // no analysis: tag and lemma fields stay empty
	}
	return s.toString();
}
/**
 * Joins the given items into a JSON-style array literal: "[a, b, c]".
 * Elements are appended verbatim (callers pre-escape them), separated by
 * ", " and wrapped in square brackets.
 * @param tags pre-formatted element strings
 * @return the bracketed, comma-separated listing
 */
private static StringBuilder formatJSON(Collection tags) {
	StringBuilder json = new StringBuilder("[");
	String separator = "";
	for (Object tag : tags) {
		json.append(separator);
		json.append(tag);
		separator = ", ";
	}
	json.append("]");
	return json;
}
public static List> readCONLL(BufferedReader in) throws IOException {
String s;
List sentence = new LinkedList();
List> result = new LinkedList>();
CoreLabel stag = new CoreLabel();
stag.set(TextAnnotation.class, "");
sentence.add(stag);
while ((s = in.readLine()) != null) {
if (s.trim().length() > 0) {
String[] fields = s.split("\t");
String token = fields[1];
if (!token.equalsIgnoreCase("_")) token = token.replace('_', ' ');
String extraColumns = "";
if (saveColumns) {
for (int field_i = 6; field_i < fields.length; field_i++) extraColumns += fields[field_i] + "\t";
extraColumns.trim();
}
String syntax = "";
if (fields.length >= 10) syntax = fields[6] + "\t" + fields[7] + "\t" + fields[8] + "\t" + fields[9];
CoreLabel word = new CoreLabel();
word.set(TextAnnotation.class, token);
word.set(ParentAnnotation.class, syntax);
word.set(ExtraColumnAnnotation.class, extraColumns);
sentence.add(word);
} else {
stag = new CoreLabel();
stag.set(TextAnnotation.class, "");
sentence.add(stag);
result.add(LVMorphologyReaderAndWriter.analyzeLabels(sentence));
sentence = new LinkedList();
stag = new CoreLabel();
stag.set(TextAnnotation.class, "");
sentence.add(stag);
}
}
if (sentence.size() > 0) {
stag = new CoreLabel();
stag.set(TextAnnotation.class, "");
sentence.add(stag);
result.add(LVMorphologyReaderAndWriter.analyzeLabels(sentence));
}
return result;
}
}