All Downloads are FREE. Search and download functionalities are using the official Maven repository.

marytts.modules.OpenNLPPosTagger Maven / Gradle / Ivy

The newest version!
/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.modules;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.server.MaryProperties;
import marytts.util.MaryUtils;
import marytts.util.dom.MaryDomUtils;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.traversal.NodeIterator;
import org.w3c.dom.traversal.TreeWalker;

/**
 * Part-of-speech tagger using OpenNLP.
 * <p>
 * For every sentence element of the input document, the text of its token elements is handed to an OpenNLP
 * {@link POSTaggerME}; the resulting tag is written to the <code>pos</code> attribute of each token that does not already
 * carry one. If a POS map is configured, tags are translated through that map before being written.
 *
 * @author Marc Schröder
 */
public class OpenNLPPosTagger extends InternalModule {
	/** Prefix (always ending in ".") of the properties this module reads: <code>model</code> and <code>posMap</code>. */
	private final String propertyPrefix;
	/** The OpenNLP tagger; created in {@link #startup()}. */
	private POSTaggerME tagger;
	/** Optional mapping from tagger output tags to the tags written to the document; null if no map is configured. */
	private Map<String, String> posMapper = null;

	/**
	 * Constructor which can be directly called from init info in the config file. Different languages can call this code with
	 * different settings.
	 *
	 * @param locale
	 *            a locale string, e.g. "en"
	 * @param propertyPrefix
	 *            prefix of the property names holding this module's settings; a trailing "." is appended if missing
	 * @throws Exception
	 *             if the superclass constructor fails
	 */
	public OpenNLPPosTagger(String locale, String propertyPrefix) throws Exception {
		super("OpenNLPPosTagger", MaryDataType.WORDS, MaryDataType.PARTSOFSPEECH, MaryUtils.string2locale(locale));
		this.propertyPrefix = propertyPrefix.endsWith(".") ? propertyPrefix : propertyPrefix + ".";
	}

	/**
	 * Load the tagger model from the <code>model</code> property and, if the <code>posMap</code> property yields a stream,
	 * the POS map from it.
	 * <p>
	 * The POS map is read as UTF-8 text. Lines starting with "#" and blank lines are skipped; every other line must contain
	 * at least two whitespace-separated tokens, of which the first is the tagger's tag and the second the tag to write
	 * instead. Any further tokens on the line are ignored.
	 *
	 * @throws IllegalArgumentException
	 *             if a POS map line contains fewer than two tokens
	 * @throws Exception
	 *             if the superclass startup fails or a stream cannot be obtained or read
	 */
	public void startup() throws Exception {
		super.startup();

		InputStream modelStream = MaryProperties.needStream(propertyPrefix + "model");
		try {
			tagger = new POSTaggerME(new POSModel(modelStream));
		} finally {
			// close the stream even if the model cannot be parsed
			modelStream.close();
		}

		// Opened only after the model has been loaded, so a model failure cannot leak this stream.
		InputStream posMapperStream = MaryProperties.getStream(propertyPrefix + "posMap");
		if (posMapperStream != null) {
			try {
				// Build into a local so that a failure half-way through does not leave a partial map behind.
				Map<String, String> mapping = new HashMap<String, String>();
				BufferedReader br = new BufferedReader(new InputStreamReader(posMapperStream, "UTF-8"));
				String line;
				while ((line = br.readLine()) != null) {
					// skip comments and empty lines; the comment marker is only recognised in the first column
					if (line.startsWith("#") || line.trim().equals(""))
						continue;
					// Entry format: POS GPOS, i.e. two space-separated entries per line
					StringTokenizer st = new StringTokenizer(line);
					if (st.countTokens() < 2) {
						throw new IllegalArgumentException("Malformed entry in POS map '" + propertyPrefix
								+ "posMap' (expected 'POS GPOS'): '" + line + "'");
					}
					String pos = st.nextToken();
					String gpos = st.nextToken();
					mapping.put(pos, gpos);
				}
				posMapper = mapping;
			} finally {
				posMapperStream.close();
			}
		}
	}

	/**
	 * Tag all tokens in the document of <code>d</code> with their part of speech.
	 * <p>
	 * The document is modified in place and then attached to the returned data object. Tokens that already have a
	 * <code>pos</code> attribute keep it.
	 *
	 * @param d
	 *            input data whose document contains the sentence and token elements to tag
	 * @return a new MaryData of this module's output type and the locale of <code>d</code>, holding the same (now tagged)
	 *         document
	 * @throws Exception
	 *             if tagging fails
	 */
	@SuppressWarnings("unchecked")
	public MaryData process(MaryData d) throws Exception {

		Document doc = d.getDocument();
		NodeIterator sentenceIt = MaryDomUtils.createNodeIterator(doc, doc, MaryXML.SENTENCE);
		Element sentence;
		while ((sentence = (Element) sentenceIt.nextNode()) != null) {
			TreeWalker tokenIt = MaryDomUtils.createTreeWalker(sentence, MaryXML.TOKEN);
			List<String> tokens = new ArrayList<String>();
			Element t;
			while ((t = (Element) tokenIt.nextNode()) != null) {
				tokens.add(MaryDomUtils.tokenText(t));
			}
			// A one-token sentence is padded with a full stop before tagging (presumably to give the tagger some
			// context -- TODO confirm). The extra tag is never consumed below, since only real token elements are walked.
			if (tokens.size() == 1) {
				tokens.add(".");
			}
			List<String> partsOfSpeech = null;
			// serialise access to the single shared tagger instance
			synchronized (this) {
				partsOfSpeech = tagger.tag(tokens);
			}
			tokenIt.setCurrentNode(sentence); // reset treewalker so we can walk through once again
			Iterator<String> posIt = partsOfSpeech.iterator();
			while ((t = (Element) tokenIt.nextNode()) != null) {
				assert posIt.hasNext();
				// always consume the tag, so tags and tokens stay aligned even when this token is skipped
				String pos = posIt.next();
				if (t.hasAttribute("pos")) {
					continue;
				}
				if (posMapper != null) {
					String gpos = posMapper.get(pos);
					if (gpos == null)
						logger.warn("POS map file incomplete: do not know how to map '" + pos + "'");
					else
						pos = gpos;
				}
				t.setAttribute("pos", pos);
			}
		}

		MaryData output = new MaryData(outputType(), d.getLocale());
		output.setDocument(doc);
		return output;
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy