marytts.modules.OpenNLPPosTagger Maven / Gradle / Ivy
The newest version!
/**
* Copyright 2007 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.modules;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.server.MaryProperties;
import marytts.util.MaryUtils;
import marytts.util.dom.MaryDomUtils;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.traversal.NodeIterator;
import org.w3c.dom.traversal.TreeWalker;
/**
* Part-of-speech tagger using OpenNLP.
*
* @author Marc Schröder
*/
public class OpenNLPPosTagger extends InternalModule {
    /** Prefix (always ending in ".") under which this module's properties are looked up. */
    private String propertyPrefix;
    /** The OpenNLP tagger; created in {@link #startup()}. */
    private POSTaggerME tagger;
    /** Optional mapping from tagger output tags to the tags written into the document; null if no map is configured. */
    private Map<String, String> posMapper = null;

    /**
     * Constructor which can be directly called from init info in the config file. Different languages can call this code with
     * different settings.
     *
     * @param locale
     *            a locale string, e.g. "en"
     * @param propertyPrefix
     *            prefix of the properties "model" and "posMap" read in {@link #startup()}; a trailing "." is appended if
     *            missing
     * @throws Exception
     *             if the superclass constructor fails
     */
    public OpenNLPPosTagger(String locale, String propertyPrefix) throws Exception {
        super("OpenNLPPosTagger", MaryDataType.WORDS, MaryDataType.PARTSOFSPEECH, MaryUtils.string2locale(locale));
        if (!propertyPrefix.endsWith("."))
            propertyPrefix = propertyPrefix + ".";
        this.propertyPrefix = propertyPrefix;
    }

    /**
     * Load the POS model from the stream configured as <code>&lt;prefix&gt;model</code> and, if
     * <code>&lt;prefix&gt;posMap</code> is configured, the POS mapping table. Both streams are closed before this method
     * returns, whether or not loading succeeded.
     *
     * @throws Exception
     *             if the model or the POS map cannot be read, or if the POS map contains a malformed entry
     */
    public void startup() throws Exception {
        super.startup();
        InputStream modelStream = MaryProperties.needStream(propertyPrefix + "model");
        try {
            tagger = new POSTaggerME(new POSModel(modelStream));
        } finally {
            modelStream.close();
        }
        // The map stream is only opened once the model is loaded, so a model failure cannot leak it.
        InputStream posMapperStream = MaryProperties.getStream(propertyPrefix + "posMap");
        if (posMapperStream != null) {
            try {
                // Assign only after a complete, successful parse so the field is never half-filled.
                posMapper = readPosMap(posMapperStream);
            } finally {
                posMapperStream.close();
            }
        }
    }

    /**
     * Parse a POS map. Entry format: POS GPOS, i.e. two whitespace-separated entries per line. Lines starting with "#" and
     * blank lines are skipped; anything after the second entry on a line is ignored.
     *
     * @param in
     *            UTF-8 encoded stream to read from; not closed by this method
     * @return the mapping from POS to GPOS
     * @throws Exception
     *             if reading fails or a line has fewer than two entries
     */
    private Map<String, String> readPosMap(InputStream in) throws Exception {
        Map<String, String> map = new HashMap<String, String>();
        BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        String line;
        while ((line = br.readLine()) != null) {
            // skip comments and empty lines; the "#" test is on the untrimmed line on purpose,
            // only a "#" in the first column starts a comment
            if (line.startsWith("#") || line.trim().equals(""))
                continue;
            StringTokenizer st = new StringTokenizer(line);
            if (st.countTokens() < 2) {
                throw new IllegalArgumentException("Malformed POS map entry (expected 'POS GPOS'): '" + line + "'");
            }
            String pos = st.nextToken();
            String gpos = st.nextToken();
            map.put(pos, gpos);
        }
        return map;
    }

    /**
     * Tag every token of every sentence in the input document with a "pos" attribute. Tokens that already carry a "pos"
     * attribute keep it. If a POS map was loaded, tagger output is translated through it; unmapped tags are logged and
     * written unchanged.
     *
     * @param d
     *            input data whose document is annotated in place
     * @return new data of this module's output type and the input's locale, wrapping the same (modified) document
     * @throws Exception
     *             if processing fails
     */
    @SuppressWarnings("unchecked")
    public MaryData process(MaryData d) throws Exception {
        Document doc = d.getDocument();
        NodeIterator sentenceIt = MaryDomUtils.createNodeIterator(doc, doc, MaryXML.SENTENCE);
        Element sentence;
        while ((sentence = (Element) sentenceIt.nextNode()) != null) {
            TreeWalker tokenIt = MaryDomUtils.createTreeWalker(sentence, MaryXML.TOKEN);
            List<String> tokens = new ArrayList<String>();
            Element t;
            while ((t = (Element) tokenIt.nextNode()) != null) {
                tokens.add(MaryDomUtils.tokenText(t));
            }
            if (tokens.isEmpty()) {
                // nothing to annotate in this sentence; do not hand an empty sequence to the tagger
                continue;
            }
            if (tokens.size() == 1) {
                // a single-token sentence is tagged together with an appended "."; the extra tag is never consumed below
                tokens.add(".");
            }
            List<String> partsOfSpeech = null;
            // all calls into the shared tagger instance are serialized on this module
            synchronized (this) {
                partsOfSpeech = tagger.tag(tokens);
            }
            tokenIt.setCurrentNode(sentence); // reset treewalker so we can walk through once again
            Iterator<String> posIt = partsOfSpeech.iterator();
            while ((t = (Element) tokenIt.nextNode()) != null) {
                assert posIt.hasNext();
                // always consume the tag so tags and tokens stay aligned, even if this token is skipped
                String pos = posIt.next();
                if (t.hasAttribute("pos")) {
                    continue;
                }
                if (posMapper != null) {
                    String gpos = posMapper.get(pos);
                    if (gpos == null)
                        logger.warn("POS map file incomplete: do not know how to map '" + pos + "'");
                    else
                        pos = gpos;
                }
                t.setAttribute("pos", pos);
            }
        }
        MaryData output = new MaryData(outputType(), d.getLocale());
        output.setDocument(doc);
        return output;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy