marytts.modules.JTokeniser Maven / Gradle / Ivy
/**
* Copyright 2000-2006 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see .
*
*/
package marytts.modules;
import java.util.Locale;
import java.util.Properties;
import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.util.dom.DomUtils;
import marytts.util.dom.MaryDomUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.NodeIterator;
import de.dfki.lt.tools.tokenizer.JTok;
import de.dfki.lt.tools.tokenizer.annotate.AnnotatedString;
import de.dfki.lt.tools.tokenizer.annotate.FastAnnotatedString;
/**
* @author Marc Schröder
*
*
*/
public class JTokeniser extends InternalModule {
public static final int TOKEN_MAXLENGTH = 100;
private JTok jtok;
private String jtokLocale;
public JTokeniser() {
this((Locale) null);
}
public JTokeniser(String locale) {
super("JTokeniser", MaryDataType.RAWMARYXML, MaryDataType.TOKENS, new Locale(locale));
}
public JTokeniser(Locale locale) {
this(MaryDataType.RAWMARYXML, MaryDataType.TOKENS, locale);
}
public JTokeniser(MaryDataType inputType, MaryDataType outputType, Locale locale) {
super("JTokeniser", inputType, outputType, locale);
// Which language to use in the Tokenizer?
if (locale == null) {
// if locale == null, use English tokeniser as default/fallback tokeniser
jtokLocale = "en";
} else {
jtokLocale = locale.getLanguage();
}
}
/**
* Set the tokenizer language to be different from the Locale of the module. This can be useful when reusing another
* language's tokenizer data.
*
* @param languageCode
* the language-code to use, as a two-character string such as "de" or "en".
*/
protected void setTokenizerLanguage(String languageCode) {
jtokLocale = languageCode;
}
public void startup() throws Exception {
super.startup();
Properties jtokProperties = new Properties();
jtokProperties.setProperty("languages", jtokLocale);
jtokProperties.setProperty(jtokLocale, "jtok/" + jtokLocale);
jtok = new JTok(jtokProperties);
}
public MaryData process(MaryData d) throws Exception {
Document doc = d.getDocument();
// The challenge in this module is that the tokeniser needs a plain text
// version of this XML document, but we need to feed its output
// back into the XML document.
// Solution strategy: Remember the alignment of text with XML as an
// annotated string. Each stretch of characters is annotated with the
// DOM Text object to which it belonged.
NodeIterator ni = ((DocumentTraversal) doc).createNodeIterator(doc, NodeFilter.SHOW_TEXT, null, false);
Text textNode;
StringBuilder inputText = new StringBuilder();
// Keep this loop in sync with the second loop, below:
while ((textNode = (Text) ni.nextNode()) != null) {
String text = textNode.getData().trim();
// text = text.replaceAll("\\.", " \\.");
if (text.length() == 0)
continue;
// Insert a space character between non-punctuation characters:
if (inputText.length() > 0 && !Character.isWhitespace(inputText.charAt(inputText.length() - 1))
&& Character.isLetterOrDigit(text.charAt(0))) {
inputText.append(" ");
}
inputText.append(text);
}
FastAnnotatedString maryText = new FastAnnotatedString(inputText.toString());
// And now go through the TEXT nodes a second time and annotate
// the string.
ni = ((DocumentTraversal) doc).createNodeIterator(doc, NodeFilter.SHOW_TEXT, null, false);
int pos = 0;
// Keep this loop in sync with the first loop, above:
while ((textNode = (Text) ni.nextNode()) != null) {
String text = textNode.getData().trim();
// text = text.replaceAll("\\.", " \\.");
int len = text.length();
if (len == 0)
continue;
// Skip the space character between non-punctuation characters:
if (pos > 0 && !Character.isWhitespace(inputText.charAt(pos - 1)) && Character.isLetterOrDigit(text.charAt(0))) {
pos++;
}
maryText.annotate("MARYXML", textNode, pos, pos + len);
pos += len;
}
// Now maryText is the input text annotated with the Text nodes in the
// MaryXML document.
// Tokenise:
AnnotatedString tokenisedText;
tokenisedText = jtok.tokenize(inputText.toString(), jtokLocale);
// System.err.println("maryText: `" + maryText.toString() + "'");
// System.err.println("tokenisedText: `" + tokenisedText.toString() + "'");
// assert tokenisedText.toString().equals(maryText.toString());
// And now merge the output back into the MaryXML document.
Element firstTokenInSentence = null;
Element firstTokenInParagraph = null;
Element previousToken = null;
Text currentTextNode = null;
char c = tokenisedText.setIndex(0);
maryText.setIndex(0);
while (c != AnnotatedString.DONE) {
int tokenStart = tokenisedText.getRunStart(JTok.CLASS_ANNO);
int tokenEnd = tokenisedText.getRunLimit(JTok.CLASS_ANNO);
// check if c belongs to a token
if (null != tokenisedText.getAnnotation(JTok.CLASS_ANNO)) {
// We don't care about the actual annotation, only that there is one.
// Where to insert the token:
maryText.setIndex(tokenStart);
Text tn = (Text) maryText.getAnnotation("MARYXML");
assert tn != null;
// Only create new token tag if there was none around the corresponding
// text node in the input:
Element t = null;
if (MaryDomUtils.hasAncestor(tn, MaryXML.TOKEN)) {
t = (Element) MaryDomUtils.getAncestor(tn, MaryXML.TOKEN);
} else { // need to create a new element:
String token = tokenisedText.substring(tokenStart, tokenEnd);
t = MaryXML.createElement(doc, MaryXML.TOKEN);
MaryDomUtils.setTokenText(t, token);
// Insert the new token element
tn.getParentNode().insertBefore(t, tn);
}
// If we have by now moved past currentTextNode, delete it
// (unless it was already inside a t tag):
if (currentTextNode != null && currentTextNode != tn && !MaryDomUtils.hasAncestor(currentTextNode, MaryXML.TOKEN)) {
currentTextNode.getParentNode().removeChild(currentTextNode);
}
currentTextNode = tn;
// Is this token the first in a new sentence or paragraph?
if (null != tokenisedText.getAnnotation(JTok.BORDER_ANNO)) {
// Yes, this is the first token of a new sentence
// Need to enclose previous sentence with an tag?
if (firstTokenInSentence != null) {
assert previousToken != null;
if (!MaryDomUtils.hasAncestor(firstTokenInSentence, MaryXML.SENTENCE)
&& !MaryDomUtils.hasAncestor(previousToken, MaryXML.SENTENCE)) {
Element firstPara = (Element) MaryDomUtils.getAncestor(firstTokenInSentence, MaryXML.PARAGRAPH);
Element lastPara = (Element) MaryDomUtils.getAncestor(previousToken, MaryXML.PARAGRAPH);
if (firstPara == null && lastPara == null || firstPara.equals(lastPara)) {
// Either both tokens have no surrounding paragraph, or both have
// the same.
encloseWithSentence(firstTokenInSentence, previousToken);
}
}
}
firstTokenInSentence = null;
// Is it even a paragraph start?
if (tokenisedText.getAnnotation(JTok.BORDER_ANNO) == JTok.P_BORDER) {
// Need to enclose previous paragraph with a tag?
if (firstTokenInParagraph != null) {
assert previousToken != null;
if (!MaryDomUtils.hasAncestor(firstTokenInParagraph, MaryXML.PARAGRAPH)
&& !MaryDomUtils.hasAncestor(previousToken, MaryXML.PARAGRAPH)) {
DomUtils.encloseNodesWithNewElement(
DomUtils.getAncestor(firstTokenInParagraph, MaryXML.SENTENCE),
DomUtils.getAncestor(previousToken, MaryXML.SENTENCE), MaryXML.PARAGRAPH);
}
}
firstTokenInParagraph = null;
}
}
previousToken = t;
if (firstTokenInSentence == null)
firstTokenInSentence = t;
if (firstTokenInParagraph == null)
firstTokenInParagraph = t;
}
c = tokenisedText.setIndex(tokenEnd);
maryText.setIndex(tokenEnd);
}
// Remove the last old text node
if (currentTextNode != null // should be
&& !MaryDomUtils.hasAncestor(currentTextNode, MaryXML.TOKEN))
currentTextNode.getParentNode().removeChild(currentTextNode);
// Insert the last element
if (firstTokenInSentence != null) {
assert previousToken != null;
if (!MaryDomUtils.hasAncestor(firstTokenInSentence, MaryXML.SENTENCE)
&& !MaryDomUtils.hasAncestor(previousToken, MaryXML.SENTENCE)) {
Element firstPara = (Element) MaryDomUtils.getAncestor(firstTokenInSentence, MaryXML.PARAGRAPH);
Element lastPara = (Element) MaryDomUtils.getAncestor(previousToken, MaryXML.PARAGRAPH);
if (firstPara == null && lastPara == null || firstPara.equals(lastPara)) {
// Either both tokens have no surrounding paragraph, or both have
// the same.
encloseWithSentence(firstTokenInSentence, previousToken);
}
}
}
// Insert the last
element
if (firstTokenInParagraph != null) {
assert previousToken != null;
if (!MaryDomUtils.hasAncestor(firstTokenInParagraph, MaryXML.PARAGRAPH)
&& !MaryDomUtils.hasAncestor(previousToken, MaryXML.PARAGRAPH)) {
DomUtils.encloseNodesWithNewElement(DomUtils.getAncestor(firstTokenInParagraph, MaryXML.SENTENCE),
DomUtils.getAncestor(previousToken, MaryXML.SENTENCE), MaryXML.PARAGRAPH);
}
}
// And finally, we make sure for every token that it is not longer than TOKEN_MAXLENGTH:
NodeIterator tIt = MaryDomUtils.createNodeIterator(doc.getDocumentElement(), MaryXML.TOKEN);
Element t = null;
while ((t = (Element) tIt.nextNode()) != null) {
String tokenText = MaryDomUtils.tokenText(t);
if (tokenText.length() > TOKEN_MAXLENGTH) {
String cutTT = tokenText.substring(0, TOKEN_MAXLENGTH);
logger.info("Cutting exceedingly long input token (length " + tokenText.length() + " ) to length "
+ TOKEN_MAXLENGTH + ":\n" + "before: " + tokenText + "\nafter: " + cutTT);
MaryDomUtils.setTokenText(t, cutTT);
}
}
MaryData result = new MaryData(outputType(), d.getLocale());
result.setDocument(doc);
return result;
}
/**
* @param firstTokenInSentence
* @param lastTokenInSentence
*/
private void encloseWithSentence(Element firstTokenInSentence, Element lastTokenInSentence) {
// Allow for sentences to start with a element:
Element encloseFromHere = firstTokenInSentence;
Element maybeBoundary = DomUtils.getPreviousSiblingElement(firstTokenInSentence);
if (maybeBoundary != null && maybeBoundary.getTagName().equals(MaryXML.BOUNDARY)) {
encloseFromHere = maybeBoundary;
}
Element encloseToHere = lastTokenInSentence;
maybeBoundary = DomUtils.getNextSiblingElement(lastTokenInSentence);
if (maybeBoundary != null && maybeBoundary.getTagName().equals(MaryXML.BOUNDARY)) {
encloseToHere = maybeBoundary;
}
DomUtils.encloseNodesWithNewElement(encloseFromHere, encloseToHere, MaryXML.SENTENCE);
}
}