// marytts.modules.ProsodyGeneric Maven / Gradle / Ivy
/**
* Copyright 2000-2006 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.modules;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.exceptions.MaryConfigurationException;
import marytts.exceptions.NoSuchPropertyException;
import marytts.server.MaryProperties;
import marytts.util.dom.DomUtils;
import marytts.util.dom.MaryDomUtils;
import marytts.util.dom.NameNodeFilter;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.NodeIterator;
import org.w3c.dom.traversal.TreeWalker;
/**
* The generic prosody module.
*
* @author Stephanie Becker
*/
public class ProsodyGeneric extends InternalModule {
// The four String fields below hold config property NAMES, not values; startup() resolves them via MaryProperties.
protected String paragraphDeclination; // name of the config file entry
protected boolean applyParagraphDeclination; // specified per language in mary config files; read in startup()
protected String syllableAccents; // specified in mary config files: ToBI accents on words or syllables?
protected boolean accentedSyllables; // when true, process() calls copyAccentsToSyllables()
// config entry for the accentPriorities file (contains attribute values (e.g. parts of speech)
// and a number used to rank them as accent candidates), specified in the maryrc file
protected String accentPriorities;
protected Properties priorities; // contents of the accentPriorities file, loaded in startup()
protected String tobiPredFilename; // xml rule file for prosody prediction
protected HashMap tobiPredMap = new HashMap(); // map that will be filled with the rules
protected HashMap listMap = new HashMap(); // map that will contain the lists defined in the
// xml rule file
private boolean convertToBI2Contour; // read from "prosody.convertToBI2Contour" in startup()
protected HashMap toBI2ContourMap; // ToBI accent -> contour specification; only filled when convertToBI2Contour is true
/**
* Creates a prosody module with a null locale; delegates to {@link #ProsodyGeneric(Locale)}, which uses the
* "fallback.prosody." property prefix.
*/
public ProsodyGeneric() {
this((Locale) null);
}
/**
 * Creates a prosody module with explicit input/output types and explicit config property names.
 *
 * @param inputType
 *            data type this module accepts
 * @param outputType
 *            data type this module produces
 * @param locale
 *            locale this module is registered for
 * @param tobipredFileName
 *            name of the config entry for the ToBI prediction rule file
 * @param accentPriorities
 *            name of the config entry for the accent priorities file
 * @param syllableAccents
 *            name of the boolean config entry: accents on syllables?
 * @param paragraphDeclination
 *            name of the boolean config entry: apply paragraph declination?
 */
public ProsodyGeneric(MaryDataType inputType, MaryDataType outputType, Locale locale, String tobipredFileName,
        String accentPriorities, String syllableAccents, String paragraphDeclination) {
    super("Prosody", inputType, outputType, locale);
    // Only the property names are stored here; they are resolved in startup().
    this.paragraphDeclination = paragraphDeclination;
    this.syllableAccents = syllableAccents;
    this.accentPriorities = accentPriorities;
    this.tobiPredFilename = tobipredFileName;
}
/**
* Convenience constructor taking the locale as a string.
*
* @param locale
*            string handed to {@code new Locale(String)}
* @param propertyPrefix
*            prefix prepended to the config entry names used by this module
*/
public ProsodyGeneric(String locale, String propertyPrefix) {
this(new Locale(locale), propertyPrefix);
}
/**
 * Creates a prosody module converting PHONEMES to INTONATION, deriving the config entry names from a prefix.
 *
 * @param locale
 *            locale this module is registered for
 * @param propertyPrefix
 *            prefix prepended to "tobipredparams", "accentPriorities", "syllableaccents" and
 *            "paragraphdeclination" to form the config entry names
 */
public ProsodyGeneric(Locale locale, String propertyPrefix) {
    super("Prosody", MaryDataType.PHONEMES, MaryDataType.INTONATION, locale);
    // Only the property names are built here; they are resolved in startup().
    this.paragraphDeclination = propertyPrefix + "paragraphdeclination";
    this.syllableAccents = propertyPrefix + "syllableaccents";
    this.accentPriorities = propertyPrefix + "accentPriorities";
    this.tobiPredFilename = propertyPrefix + "tobipredparams";
}
/**
* Creates a prosody module for the locale named by the given string, using the "fallback.prosody." property prefix.
*
* @param locale
*            string handed to {@code new Locale(String)}
*/
public ProsodyGeneric(String locale) {
this(new Locale(locale), "fallback.prosody.");
}
/**
* Creates a prosody module for the given locale, using the "fallback.prosody." property prefix.
*
* @param locale
*            locale this module is registered for
*/
public ProsodyGeneric(Locale locale) {
this(locale, "fallback.prosody.");
}
/**
 * Prepares the module for use: loads the accent priorities, reads the boolean settings, fills the rule and list
 * maps from the ToBI rule file, optionally sets up the ToBI-to-contour lookup, and finally starts the superclass.
 *
 * @throws Exception
 *             in particular MaryConfigurationException if the priorities, the rule file or the external contour
 *             lookup file cannot be loaded
 */
public void startup() throws Exception {
    priorities = new Properties();
    if (accentPriorities != null) {
        InputStream priorityStream = MaryProperties.needStream(accentPriorities);
        try {
            priorities.load(priorityStream);
        } catch (IOException ioe) {
            throw new MaryConfigurationException("can't load accent priorities from "
                    + MaryProperties.getProperty(accentPriorities), ioe);
        } finally {
            priorityStream.close();
        }
    }

    // A missing property name means "feature off"; the property is only looked up when a name was given.
    accentedSyllables = syllableAccents != null && MaryProperties.getBoolean(syllableAccents);
    applyParagraphDeclination = paragraphDeclination != null && MaryProperties.getBoolean(paragraphDeclination);

    try {
        loadTobiPredRules(); // rule map first ...
        buildListMap(); // ... because the list map is built from its "definitions" entry
    } catch (Exception cause) {
        throw new MaryConfigurationException("Can't fill prosody maps ", cause);
    }

    convertToBI2Contour = MaryProperties.getBoolean("prosody.convertToBI2Contour", false);
    if (convertToBI2Contour && MaryProperties.getBoolean("prosody.externalToBI2Contour", false)) {
        // lookup table comes from a user-supplied file
        String lookupFile = MaryProperties.getFilename("prosody.ToBI2ContourMapFile");
        try {
            toBI2ContourMap = getToBI2ContourMap(lookupFile);
        } catch (IOException ioe) {
            throw new MaryConfigurationException("can't read ToBI2Contour lookup file: " + lookupFile, ioe);
        }
    } else if (convertToBI2Contour) {
        // built-in default lookup table
        toBI2ContourMap = getToBI2ContourMap();
    }

    super.startup();
}
/**
 * Parses the xml rule file named by the tobiPredFilename config entry and stores its top-level sections in
 * tobiPredMap. Only the direct children of the root element called "definitions" (list definitions),
 * "accentposition" (which words receive accents), "accentshape" (which type of accent a word receives) and
 * "boundaries" (location and type of boundaries) are kept, each under its own tag name.
 */
protected synchronized void loadTobiPredRules() throws FactoryConfigurationError, ParserConfigurationException,
        org.xml.sax.SAXException, IOException, NoSuchPropertyException, MaryConfigurationException {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setValidating(false);
    DocumentBuilder builder = factory.newDocumentBuilder();

    InputStream ruleStream = MaryProperties.needStream(tobiPredFilename);
    Document ruleDocument;
    try {
        ruleDocument = builder.parse(ruleStream);
    } finally {
        ruleStream.close();
    }

    Element section = MaryDomUtils.getFirstChildElement(ruleDocument.getDocumentElement());
    while (section != null) {
        String tag = section.getTagName();
        boolean isRuleSection = tag.equals("definitions") || tag.equals("accentposition")
                || tag.equals("accentshape") || tag.equals("boundaries");
        if (isRuleSection) {
            tobiPredMap.put(tag, section);
        }
        section = MaryDomUtils.getNextSiblingElement(section);
    }
}
/**
 * Fills listMap from the "list" elements found below the "definitions" section of the rule file. A list with an
 * "items" attribute is stored as a set of its items (split on spaces if the value contains a space, otherwise on
 * colons); a list with a "file" attribute is read via readListFromResource(). If both attributes are present,
 * the external list is stored last and therefore wins.
 *
 * @throws IOException
 *             if an external list cannot be read
 */
protected synchronized void buildListMap() throws IOException {
    Element definitions = (Element) tobiPredMap.get("definitions");
    TreeWalker listWalker = ((DocumentTraversal) definitions.getOwnerDocument()).createTreeWalker(definitions,
            NodeFilter.SHOW_ELEMENT, new NameNodeFilter(new String[] { "list" }), false);

    for (Element list = (Element) listWalker.nextNode(); list != null; list = (Element) listWalker.nextNode()) {
        String listName = list.getAttribute("name");

        if (list.hasAttribute("items")) { // inline definition
            String itemString = list.getAttribute("items");
            String delimiter = itemString.contains(" ") ? " " : ":";
            HashSet<String> members = new HashSet<String>();
            StringTokenizer tokenizer = new StringTokenizer(itemString, delimiter);
            while (tokenizer.hasMoreTokens()) {
                members.add(tokenizer.nextToken());
            }
            listMap.put(listName, members);
        }

        if (list.hasAttribute("file")) { // external definition
            listMap.put(listName, readListFromResource(list.getAttribute("file")));
        }
    }
}
/**
 * Read a list from an external resource. This generic implementation can read from text files (resource names
 * ending in <code>.txt</code>): the resource is looked up as <code>"prosody/" + resourceName</code> relative to
 * this object's class, decoded as UTF-8, and every line becomes one list item. Subclasses may override this
 * method to provide additional file formats. They must make sure that <code>checkList()</code> can deal with all
 * list formats.
 *
 * @param resourceName
 *            resource file in classpath from which to read the list; suffix identifies list format.
 * @return An Object representing the list; checkList() must be able to make sense of this. This base implementation
 *         returns a Set&lt;String&gt;.
 * @throws IllegalArgumentException
 *             if the resource name suffix cannot be identified as a list file format (including names that are too
 *             short to carry a suffix).
 * @throws IOException
 *             if the resource cannot be found or read from
 */
protected Object readListFromResource(String resourceName) throws IOException {
    if (!resourceName.endsWith(".txt")) {
        // Report the last four characters as before, but do not fail on names shorter than that.
        String suffix = resourceName.length() > 4 ? resourceName.substring(resourceName.length() - 4) : resourceName;
        throw new IllegalArgumentException("Unknown list file format: " + suffix);
    }

    InputStream resourceStream = this.getClass().getResourceAsStream("prosody/" + resourceName);
    if (resourceStream == null) {
        // getResourceAsStream signals "not found" with null rather than an exception
        throw new IOException("Cannot find list resource: prosody/" + resourceName);
    }

    // build a set that contains every line of the external text file
    Set<String> listSet = new HashSet<String>();
    BufferedReader in = new BufferedReader(new InputStreamReader(resourceStream, "UTF-8"));
    try {
        String line;
        // readLine() == null is the end-of-stream test; ready() only says whether a read would block
        while ((line = in.readLine()) != null) {
            listSet.add(line);
        }
    } finally {
        in.close();
    }
    return listSet;
}
/**
 * Runs prosody prediction on the document contained in the given data. Every sentence is handed to
 * processSentence(); afterwards, depending on the settings read in startup(), accents are copied to syllables,
 * the phrases of each paragraph are wrapped in prosody elements simulating paragraph declination, and ToBI
 * accents are converted to prosody contours. The input document is modified in place and returned inside a new
 * MaryData of this module's output type.
 *
 * @param d
 *            input data holding the document to process
 * @return new MaryData carrying the (modified) document
 * @throws Exception
 *             if processing fails
 */
public MaryData process(MaryData d) throws Exception {
    Document doc = d.getDocument();

    NodeIterator sentences = ((DocumentTraversal) doc).createNodeIterator(doc.getDocumentElement(),
            NodeFilter.SHOW_ELEMENT, new NameNodeFilter(MaryXML.SENTENCE), false);
    for (Element sentence = (Element) sentences.nextNode(); sentence != null; sentence = (Element) sentences
            .nextNode()) {
        logger.debug("Processing next sentence");
        processSentence(sentence);
    }

    if (accentedSyllables) {
        copyAccentsToSyllables(doc); // ToBI accents on syllables rather than on words
    }

    if (applyParagraphDeclination) {
        final int pitchDiff = 10; // difference in percent between first and last phrase
        final int rangeDiff = 40; // difference in percent between first and last phrase
        NodeList paragraphs = doc.getElementsByTagName(MaryXML.PARAGRAPH);
        for (int p = 0; p < paragraphs.getLength(); p++) {
            NodeList phrases = ((Element) paragraphs.item(p)).getElementsByTagName(MaryXML.PHRASE);
            int steps = phrases.getLength();
            if (steps > 1) { // a single phrase has nothing to decline against
                for (int j = 0; j < steps; j++) {
                    // Paragraph intonation: embed each phrase in a prosody element simulating a
                    // paragraph-wide declination phenomenon superimposed on the phrase-internal
                    // declination. The factor runs linearly from +0.5 (first phrase) to -0.5 (last).
                    double factor = (0.5 - j / (steps - 1f));
                    int pitchValue = (int) (pitchDiff * factor);
                    int rangeValue = (int) (rangeDiff * factor);

                    Element phrase = (Element) phrases.item(j);
                    Element prosody = MaryXML.createElement(phrase.getOwnerDocument(), MaryXML.PROSODY);
                    phrase.getParentNode().insertBefore(prosody, phrase);
                    prosody.appendChild(phrase);
                    prosody.setAttribute("pitch", (pitchValue >= 0 ? "+" : "") + pitchValue + "%");
                    prosody.setAttribute("range", (rangeValue >= 0 ? "+" : "") + rangeValue + "%");
                }
            }
        }
    }

    if (convertToBI2Contour) {
        convertTOBIAccents2ProsodyContour(doc);
    }

    MaryData result = new MaryData(outputType(), d.getLocale());
    result.setDocument(doc);
    return result;
}
/**
 * Converts the ToBI accents in a MARYXML document to pitch contour shapes: every token element whose "accent"
 * attribute has an entry in toBI2ContourMap is replaced by a prosody element that carries the looked-up value in
 * its "contour" attribute and contains a deep copy of the token. Tokens without an accent, or with an accent that
 * is not in the map, are left untouched.
 *
 * @param doc
 *            Document
 * @throws Exception
 *             when XML processing fails
 */
private void convertTOBIAccents2ProsodyContour(Document doc) throws Exception {
    TreeWalker tw = MaryDomUtils.createTreeWalker(doc, MaryXML.TOKEN);
    Element current = (Element) tw.nextNode();
    while (current != null) {
        String accent = current.hasAttribute("accent") ? current.getAttribute("accent") : null;
        if (accent == null || !this.toBI2ContourMap.containsKey(accent)) {
            // nothing to convert for this token
            current = (Element) tw.nextNode();
            continue;
        }

        String contourValue = this.toBI2ContourMap.get(accent);
        assert contourValue != null : "contour attribute should not be null";

        Node parent = current.getParentNode();
        Element wrapper = MaryXML.createElement(doc, MaryXML.PROSODY);
        wrapper.setAttribute("contour", contourValue);
        wrapper.appendChild(current.cloneNode(true));
        parent.insertBefore(wrapper, current);

        // Advance the walker while the old token is still attached, and only then drop it.
        Element following = (Element) tw.nextNode();
        parent.removeChild(current);
        current = following;
    }
}
/**
 * Tells whether a contour shape is defined for the given ToBI accent. The accepted accents are H*, L*, L*+H,
 * L*+!H, L+H* and !H*.
 *
 * @param accentAttribute
 *            - TOBI accent (null is treated as not defined)
 * @return true if given accent defined false if given accent not defined
 */
@Deprecated
private boolean isDefinedAccent(String accentAttribute) {
    return "H*".equals(accentAttribute)
            || "L*".equals(accentAttribute)
            || "L*+H".equals(accentAttribute)
            || "L*+!H".equals(accentAttribute)
            || "L+H*".equals(accentAttribute)
            || "!H*".equals(accentAttribute);
}
/**
 * Returns the pitch contour specification for the given accent from a built-in 'accent' to 'contour' lookup.
 * Note: if you add a new accent pitch contour shape to the lookup, do not forget to add it to
 * isDefinedAccent(..) as well.
 *
 * @param accentAttribute
 *            - TOBI accent
 * @return A suitable pitch contour specification for the given 'accent' or null if not defined in lookup
 */
@Deprecated
private String getAccentContour(String accentAttribute) {
    final String[][] lookup = {
            { "H*", "(4%, +10%)(18%,+20%)(34%,+26%)(50%,+30%)(66%,+26%)(82%,+20%)(96%,+10%)" },
            { "L*", "(4%, -10%)(18%,-20%)(34%,-26%)(50%,-30%)(66%,-26%)(82%,-20%)(96%,-10%)" },
            { "L*+H", "(2%, -7%)(18%,-16%)(34%,-19%)(50%,-20%)(66%,-15%)(82%,-4%)(100%,+25%)" },
            { "L*+!H", "(0%, +5%)(4%, -7%)(18%,-16%)(34%,-20%)(48%, -7%)(52%, +10%)(66%,+20%)(82%,+26%)(100%,+30%)" },
            { "L+H*", "(0%, -20%)(18%,-19%)(34%,-17%)(45%, -7%)(55%, +10%)(66%,+25%)(82%,+28%)(100%,+30%)" },
            { "!H*", "(0%, +30%)(18%,+10%)(34%,+5%)(50%,0%)(66%,-13%)(82%,-17%)(100%,-20%)" } };
    for (String[] entry : lookup) {
        if (entry[0].equals(accentAttribute)) {
            return entry[1];
        }
    }
    return null; // accent not in the lookup (also for a null argument)
}
/**
 * Builds the default ToBI to contour map, with entries for the accents H*, L*, L*+H, L*+!H, L+H* and !H*.
 *
 * @return HashMap TOBI to contour map
 */
private HashMap getToBI2ContourMap() {
    final String[][] defaults = {
            { "H*", "(4%, +10%)(18%,+20%)(34%,+26%)(50%,+30%)(66%,+26%)(82%,+20%)(96%,+10%)" },
            { "L*", "(4%, -10%)(18%,-20%)(34%,-26%)(50%,-30%)(66%,-26%)(82%,-20%)(96%,-10%)" },
            { "L*+H", "(2%, -7%)(18%,-16%)(34%,-19%)(50%,-20%)(66%,-15%)(82%,-4%)(100%,+25%)" },
            { "L*+!H", "(0%, +5%)(4%, -7%)(18%,-16%)(34%,-20%)(48%, -7%)(52%, +10%)(66%,+20%)(82%,+26%)(100%,+30%)" },
            { "L+H*", "(0%, -20%)(18%,-19%)(34%,-17%)(45%, -7%)(55%, +10%)(66%,+25%)(82%,+28%)(100%,+30%)" },
            { "!H*", "(0%, +30%)(18%,+10%)(34%,+5%)(50%,0%)(66%,-13%)(82%,-17%)(100%,-20%)" } };
    HashMap map = new HashMap();
    for (String[] pair : defaults) {
        map.put(pair[0], pair[1]);
    }
    return map;
}
/**
 * Read pitch contour specifications from an external file into a map which maintains an 'accent' to 'contour'
 * lookup. Each line is trimmed; empty lines and lines starting with '#' are skipped. A remaining line is used
 * only if splitting it on '|' yields exactly two fields, the first being the accent and the second the contour;
 * all other lines are silently ignored. The file is read with the platform default charset.
 *
 * @param externalFileName
 *            external lookup file
 * @return HashMap lookup map
 * @throws IOException
 *             when reading external file fails
 */
private HashMap getToBI2ContourMap(String externalFileName) throws IOException {
    HashMap map = new HashMap();
    BufferedReader bfr = new BufferedReader(new FileReader(new File(externalFileName)));
    try {
        String line;
        while ((line = bfr.readLine()) != null) {
            line = line.trim();
            // skip empty lines and lines starting with '#'
            if ("".equals(line) || line.startsWith("#")) {
                continue;
            }
            // a line without '|' splits into a single field and is dropped by the length check
            String[] words = line.split("\\|");
            if (words.length == 2) {
                map.put(words[0], words[1]);
            }
        }
    } finally {
        // release the file handle even if reading fails
        bfr.close();
    }
    return map;
}
/**
* Assigns accents and boundary tones to the tokens and boundaries of one sentence. The work is done in three
* passes: a forward loop over the tokens that decides accent positions and inserts boundaries (making sure that
* a phrase closed by a boundary with breakindex 3 or higher has at least one accent), a loop over the boundaries
* that honours user input (deleting boundaries with breakindex "none", choosing a tone for boundaries whose tone
* is "unknown"), and a backward loop over the tokens that assigns the accent types.
*
* @param sentence
*            the sentence element to process; a sentence without tokens is left untouched
*/
protected void processSentence(Element sentence) {
NodeList tokens = sentence.getElementsByTagName(MaryXML.TOKEN);
if (tokens.getLength() < 1) {
return; // no tokens -- what can we do?
}
Element firstTokenInPhrase = null;
// properties of the whole sentence
// first determine the sentence type
String sentenceType = "decl";
sentenceType = getSentenceType(tokens);
// determine if it is the last (but not the only) sentence in a paragraph
boolean paragraphFinal = MaryDomUtils.isLastOfItsKindIn(sentence, MaryXML.PARAGRAPH)
&& !MaryDomUtils.isFirstOfItsKindIn(sentence, MaryXML.PARAGRAPH);
// check if it is a sentence with vorfeld
boolean inVorfeld = true; // default
for (int i = 0; i < tokens.getLength(); i++) { // search for the first word in sentence
Element token = ((Element) tokens.item(i));
if (!token.getAttribute("ph").equals("")) { // first word found
String posFirstWord = token.getAttribute("pos");
// if pos value of first word in sentence is contained in set noVorfeld, vorfeld doesn't exist
Set noVorfeld = (Set) listMap.get("noVorfeld");
if (noVorfeld != null) {
if (noVorfeld.contains(posFirstWord)) {
inVorfeld = false;
}
}
break;
}
}
// default: no special position
String specialPositionType = "noValue"; // can get the values "endofvorfeld" and "endofpar"(=end of paragraph)
int numEndOfVorfeld = -1;
boolean hasAccent = false; // tests if phrase has an accent
Element bestCandidate = null; // will become token with highest accent priority if a phrase has no accent;
// avoids phrases without accent
// loop over the tokens in sentence
// assignment of accent position and boundaries
for (int i = 0; i < tokens.getLength(); i++) {
Element token = (Element) tokens.item(i);
logger.debug("Now looking at token `" + MaryDomUtils.tokenText(token) + "'");
if (firstTokenInPhrase == null) {
firstTokenInPhrase = token; // begin of an intonation phrase
}
// determine if token is at end of vorfeld
if (inVorfeld) { // only if vorfeld exists and if token's position is not after vorfeld
if (i < tokens.getLength() - 1) {
Element nextToken = (Element) tokens.item(i + 1);
String posNextToken = nextToken.getAttribute("pos");
// if pos value of next token is contained in set beginOfMittelfeld,
// current token is at the end of the vorfeld
Set beginOfMittelfeld = (Set) listMap.get("beginOfMittelfeld");
if (beginOfMittelfeld != null && beginOfMittelfeld.contains(posNextToken)) {
// NOTE(review): the next line looks truncated in this copy of the file. The body of this if-block, the
// closing braces of the three enclosing if-statements and the declaration of isFinalToken (used right
// below) are not present, and numEndOfVorfeld / "endofvorfeld" are never assigned in the visible code.
// Restore the missing statements from the upstream source before relying on this method.
// for(int z=0; z= tokens.getLength() - 1); // last token in sentence?
if (paragraphFinal && isFinalToken) { // last token in sentence and in paragraph?
specialPositionType = "endofpar";
}
boolean applyRules = applyRules(token);
if (applyRules) { // rule application not turned off
// first: assignment of accent = "tone", accent="force"(for force-accents(Druckakzent)) or accent=""
// --> determine if the token receives an accent or not
// the type of accent (e.g. L+H*) is assigned later
/*** begin user input check,accent position ***/
String forceAccent = getForceAccent(token);
// note: && binds tighter than ||, so accent="unknown" alone is sufficient for the first branch
if (token.getAttribute("accent").equals("unknown") || !token.hasAttribute("accent")
&& (forceAccent.equals("word") || forceAccent.equals("syllable"))) {
setAccent(token, "tone"); // the token receives an accent according to user input
} else if (token.getAttribute("accent").equals("none") || forceAccent.equals("none")) {
// no accent according to user input
} else if (!token.getAttribute("accent").equals("")) {
// accent type is already assigned by the user, e.g. accent="L+H*"
/*** end user input check, accent position ***/
// no user input
// the rules in the xml file are applied
} else if (token.getAttribute("ph").equals("")) { // test if token is punctuation
token.removeAttribute("accent"); // doesn't receive an accent
} else { // default behaviour: determine by rule whether to assign an accent
getAccentPosition(token, tokens, i, sentenceType, specialPositionType);
}
// check if the phrase has an accent (avoid intermediate phrases without accent)
if (token.hasAttribute("accent") && !token.getAttribute("accent").equals("none")) {
hasAccent = true;
}
// if not, check if current token is the best candidate
if (!hasAccent && !(token.getAttribute("accent").equals("none") || forceAccent.equals("none"))
&& !token.getAttribute("ph").equals("")) {
if (bestCandidate == null) { // no candidate yet
bestCandidate = token;
} else {
int priorToken = -1;
int priorBestCandidate = -1;
// search for pos in accentPriorities property list
// first check priority for current token
String posCurrentToken = token.getAttribute("pos");
try {
priorToken = Integer.parseInt(priorities.getProperty(posCurrentToken));
} catch (NumberFormatException e) {
// no (numeric) priority for this pos: priorToken stays -1
}
// now check priority for bestCandidate
String posBestCandidate = bestCandidate.getAttribute("pos");
try {
priorBestCandidate = Integer.parseInt(priorities.getProperty(posBestCandidate));
} catch (NumberFormatException e) {
// no (numeric) priority for this pos: priorBestCandidate stays -1
}
// if both priorities are known and the current token's number is not larger than the
// best candidate's, the current token becomes the best candidate for accentuation
if (priorToken != -1 && priorBestCandidate != -1) {
if (priorToken <= priorBestCandidate)
bestCandidate = token;
}
}
}
if (token.getAttribute("accent").equals("none") || forceAccent.equals("none")) {
token.removeAttribute("accent");
}
} // end of accent position assignment
// now the information relevant only for boundary assignment
boolean invalidXML = false;
if (!isFinalToken) { // We only set a majorIP boundary if the XML structure
// allows the phrase to be closed before the next token
invalidXML = MaryDomUtils.isAncestor(MaryDomUtils.closestCommonAncestor(firstTokenInPhrase, tokens.item(i)),
MaryDomUtils.closestCommonAncestor(tokens.item(i), tokens.item(i + 1)));
}
if (applyRules) {
// insertion of ip- and IP-boundaries
// returns value for firstTokenInPhrase(begin of new phrase): if a boundary was inserted, firstTokenInPhrase gets
// null
// if not, firstTokenInPhrase has the same value as before
firstTokenInPhrase = getBoundary(token, tokens, i, sentenceType, specialPositionType, invalidXML,
firstTokenInPhrase);
// check that every intermediate or intonation phrase has at least one accent
// first check if a boundary was inserted
Element boundary = null;
Document doc = token.getOwnerDocument();
TreeWalker tw = ((DocumentTraversal) doc).createTreeWalker(DomUtils.getAncestor(token, MaryXML.SENTENCE),
NodeFilter.SHOW_ELEMENT, new NameNodeFilter(new String[] { MaryXML.BOUNDARY, MaryXML.TOKEN }), false);
tw.setCurrentNode(token);
logger.debug("Starting treewalker at token " + MaryDomUtils.tokenText(token));
Element next = (Element) tw.nextNode();
if (next != null && next.getTagName().equals(MaryXML.BOUNDARY)) {
logger.debug("tw found a boundary");
boundary = next;
int bi = 0;
try {
bi = Integer.parseInt(boundary.getAttribute("breakindex"));
} catch (NumberFormatException nfe) {
// non-numeric breakindex: treated as 0
}
if (bi >= 3) { // is it an intermediate or an intonation phrase?
if (!hasAccent && bestCandidate != null) { // no accent!
setAccent(bestCandidate, "tone"); // best candidate receives accent
}
// a new phrase starts after this boundary
hasAccent = false;
bestCandidate = null;
}
}
}
if (specialPositionType.equals("endofvorfeld"))
specialPositionType = "noValue";
} // loop tokens for accent position and boundary assignment
/*** user input check, boundaries ***/
NodeList boundaries = sentence.getElementsByTagName(MaryXML.BOUNDARY);
for (int i = 0; i < boundaries.getLength(); i++) {
Element boundary = (Element) boundaries.item(i);
if (boundary.getAttribute("breakindex").equals("none")) { // the boundary is to be deleted
// delete boundary
Node parent = boundary.getParentNode();
parent.removeChild(boundary);
} else if (boundary.getAttribute("tone").equals("unknown")) { // boundary, but no tone is given
// is there a preferred tone for boundaries?
Element prosody = MaryDomUtils.getClosestAncestorWithAttribute(boundary, MaryXML.PROSODY,
"preferred-boundary-type");
String preferred = null;
if (prosody != null)
preferred = prosody.getAttribute("preferred-boundary-type");
String h = boundary.getAttribute("breakindex");
int bi = 0;
String tone = null;
try {
bi = Integer.parseInt(h);
} catch (NumberFormatException e) {
} // ignore invalid values
// In each branch below the tone is taken from a list defined in the rule file; the while loops
// leave the last element returned by the set's iterator in tone.
if (bi >= 4) {
// major boundary (but we cannot insert a phrase,
// because we don't know where it should start)
if (preferred != null) {
if (preferred.equals("high")) {
Set set = (Set) listMap.get("high_major_boundary");
Iterator it = set.iterator();
while (it.hasNext())
tone = it.next();
} else { // low
Set set = (Set) listMap.get("low_major_boundary");
Iterator it = set.iterator();
while (it.hasNext())
tone = it.next();
}
} else { // there isn't any information about the tone, so we use default values specified in the xml file
if (i == boundaries.getLength() - 1) { // final boundary
if (sentenceType.equals("decl") || sentenceType.equals("excl")) { // declarative or exclamative
// sentence
Set set = (Set) listMap.get("default_IP_endOfSent");
Iterator it = set.iterator();
while (it.hasNext())
tone = (String) it.next();
} else {
Set set = (Set) listMap.get("default_IP_endOfInterrogSent"); // interrogative
Iterator it = set.iterator();
while (it.hasNext())
tone = (String) it.next();
}
} else { // non-final boundary
Set set = (Set) listMap.get("default_IP_midOfSent");
Iterator it = set.iterator();
while (it.hasNext())
tone = (String) it.next();
}
}
} else if (bi == 3) {
// minor boundary
if (preferred != null) {
if (preferred.equals("high")) {
Set set = (Set) listMap.get("high_minor_boundary");
Iterator it = set.iterator();
while (it.hasNext())
tone = (String) it.next();
} else { // low
Set set = (Set) listMap.get("low_minor_boundary");
Iterator it = set.iterator();
while (it.hasNext())
tone = (String) it.next();
}
} else {// there is no information about the tone, so we use the default values specified in the xml file
Set set = (Set) listMap.get("default_ip");
Iterator it = set.iterator();
while (it.hasNext())
tone = (String) it.next();
}
}
if (tone != null)
boundary.setAttribute("tone", tone);
}
} // for all boundaries
/*** end user input check, boundaries ***/
// now the information relevant for accent type assignment
boolean nucleusAssigned = false;
String lastAssignedTone = null; // for user input preferred-accent-shape="alternating_accents"
for (int j = tokens.getLength() - 1; j >= 0; j--) { // accent type assignment, from last token to first
Element token = (Element) tokens.item(j);
// determine specialpositionType
boolean isFinalToken = (j >= tokens.getLength() - 1); // last token in sentence?
if (paragraphFinal && isFinalToken) { // last token in paragraph?
specialPositionType = "endofpar"; // last token in sentence and in paragraph
}
if (j == numEndOfVorfeld)
specialPositionType = "endofvorfeld";
if (token.getAttribute("accent").equals("tone") || token.getAttribute("accent").equals("force")) {
/*** begin user input check, accent type ***/
Element prosody = MaryDomUtils.getClosestAncestorWithAttribute(token, MaryXML.PROSODY, "preferred-accent-shape");
if (prosody != null) {
if (token.getAttribute("accent").equals("tone")) { // no force accents in this case
String tone = null;
String preferred = prosody.getAttribute("preferred-accent-shape");
if (preferred.equals("alternating")) {
// pick an accent from the list that differs from the one assigned last
Set set = (Set) listMap.get("alternating_accents");
Iterator it = set.iterator();
while (it.hasNext()) {
String next = (String) it.next();
if (lastAssignedTone == null || !lastAssignedTone.equals(next)) {
tone = next;
}
}
} else if (preferred.equals("rising")) {
Set set = (Set) listMap.get("rising_accents");
Iterator it = set.iterator();
if (it.hasNext())
tone = (String) it.next();
} else if (preferred.equals("falling")) {
Set set = (Set) listMap.get("falling_accents");
Iterator it = set.iterator();
if (it.hasNext())
tone = (String) it.next();
}
// NOTE(review): tone is still null here if none of the branches above selected one -- confirm
// that the DOM implementation in use accepts a null attribute value
token.setAttribute("accent", tone);
if (!nucleusAssigned)
nucleusAssigned = true;
}
} else if (!(token.getAttribute("accent").equals("force") || token.getAttribute("accent").equals("tone") || token
.getAttribute("accent").equals(""))) {
// NOTE(review): this branch cannot be taken inside the enclosing if, which already requires the
// accent to be "tone" or "force"
nucleusAssigned = true; // user has already assigned an accent type
/*** end user input check, accent type ***/
} else if (token.getAttribute("ph").equals("")) { // test if token is a word (no punctuation)
// punctuation, doesn't receive an accent
} else
// xml file rules are applied
// assignment of accent type
// returns true, if nuclear accent is assigned, false otherwise
nucleusAssigned = getAccentShape(token, tokens, j, sentenceType, specialPositionType, nucleusAssigned);
}
if (token.getAttribute("accent").equals("") || token.getAttribute("accent").equals("force")) {
token.removeAttribute("accent"); // if there is no accent, the accent attribute can be removed
}
if (token.hasAttribute("accent")) {
lastAssignedTone = token.getAttribute("accent");
}
} // loop over tokens for accent type assignment
} // processSentence
/**
 * Decides by rule whether the token receives an accent. The rules come from the "accentposition" section of the
 * xml rule file and are tried from top to bottom; the first rule whose conditions all hold fires, and its action
 * writes the token attribute "accent" ("tone", "force" (force accent, Druckakzent) or "" for no accent). If no
 * rule fires, the token is left unchanged.
 *
 * @param token
 *            (current token)
 * @param tokens
 *            (list of all tokens in sentence)
 * @param position
 *            (position in token list)
 * @param sentenceType
 *            (declarative, exclamative or interrogative)
 * @param specialPositionType
 *            (end of vorfeld or end of paragraph)
 */
protected synchronized void getAccentPosition(Element token, NodeList tokens, int position, String sentenceType,
        String specialPositionType) {
    String tokenText = MaryDomUtils.tokenText(token);
    Element accentPositionRules = (Element) tobiPredMap.get("accentposition");
    // walk over the concrete rules, i.e. the elements with tag "rule"
    TreeWalker ruleWalker = ((DocumentTraversal) accentPositionRules.getOwnerDocument()).createTreeWalker(
            accentPositionRules, NodeFilter.SHOW_ELEMENT, new NameNodeFilter(new String[] { "rule" }), false);

    Element rule;
    while ((rule = (Element) ruleWalker.nextNode()) != null) {
        // A rule is a sequence of condition parts followed by an "action" part.
        for (Element part = DomUtils.getFirstChildElement(rule); part != null; part = DomUtils
                .getNextSiblingElement(part)) {
            if (part.getTagName().equals("action")) {
                // all preceding conditions held: the rule fires and the search ends
                token.setAttribute("accent", part.getAttribute("accent"));
                return;
            }
            boolean holds = checkRulePart(part, token, tokens, position, sentenceType, specialPositionType,
                    tokenText);
            if (!holds) {
                break; // condition violated, try next rule
            }
        }
    }
}
/**
 * Determines the accent type of a token using the "accentshape" rules of the rule file. The first rule whose conditions
 * all hold fires: the value of its action's "accent" attribute (f.e. "L+H*" or "*") replaces the token's "accent"
 * attribute. If no rule fires, the token is left unchanged.
 * <p>
 * Before the rules are evaluated, a prosodic position is derived for the token: "prenuclear" if
 * <code>nucleusAssigned</code> is already true; otherwise "nuclearParagraphFinal" or "nuclearNonParagraphFinal" if the
 * token carries accent="tone" (depending on whether <code>specialPositionType</code> is "endofpar"); otherwise
 * "postnuclear". This value is matched against "prosodicPosition" rule parts.
 *
 * @param token
 *            (current token)
 * @param tokens
 *            (list of all tokens in sentence)
 * @param position
 *            position of the current token in <code>tokens</code>
 * @param sentenceType
 *            (declarative, exclamative or interrogative)
 * @param specialPositionType
 *            (position in sentence)
 * @param nucleusAssigned
 *            (whether the nuclear accent is already assigned)
 * @return the updated nucleusAssigned flag: true if it was true on entry, or if a rule fired with an accent other than
 *         "*"
 */
protected synchronized boolean getAccentShape(Element token, NodeList tokens, int position, String sentenceType,
        String specialPositionType, boolean nucleusAssigned) {
    final String tokenText = MaryDomUtils.tokenText(token);

    // prosodic position of the token relative to the nucleus
    final String prosodicPositionType;
    if (nucleusAssigned) {
        prosodicPositionType = "prenuclear";
    } else if (!token.getAttribute("accent").equals("tone")) {
        prosodicPositionType = "postnuclear";
    } else if (specialPositionType.equals("endofpar")) {
        prosodicPositionType = "nuclearParagraphFinal";
    } else {
        prosodicPositionType = "nuclearNonParagraphFinal";
    }

    // walk over all <rule> elements below the "accentshape" section, top rule first
    final Element accentShapeRules = (Element) tobiPredMap.get("accentshape");
    final DocumentTraversal traversal = (DocumentTraversal) accentShapeRules.getOwnerDocument();
    final TreeWalker ruleWalker = traversal.createTreeWalker(accentShapeRules, NodeFilter.SHOW_ELEMENT,
            new NameNodeFilter(new String[] { "rule" }), false);

    Element rule;
    while ((rule = (Element) ruleWalker.nextNode()) != null) {
        // a rule is a sequence of condition parts, normally terminated by an <action> part
        for (Element part = DomUtils.getFirstChildElement(rule); part != null; part = DomUtils
                .getNextSiblingElement(part)) {
            final String partName = part.getTagName();
            if (partName.equals("action")) {
                // every condition before the action held: the rule fires and the search ends
                final String accentShape = part.getAttribute("accent");
                token.setAttribute("accent", accentShape);
                return nucleusAssigned || !accentShape.equals("*");
            }
            // prosodicPosition is only known here, so it is checked before the generic check
            if (partName.equals("prosodicPosition") && !checkProsodicPosition(part, prosodicPositionType)) {
                break; // condition violated, try next rule
            }
            if (!checkRulePart(part, token, tokens, position, sentenceType, specialPositionType, tokenText)) {
                break; // condition violated, try next rule
            }
        }
    }
    // no rule fired
    return nucleusAssigned;
}
/**
 * Decides whether a boundary is inserted after the current token, using the "boundaries" rules of the rule file. The
 * first rule whose conditions all hold fires and ends the search. Its action's "bi" attribute is the break index; a
 * break index of 0 means that no boundary is inserted. Otherwise the action's optional "tone" attribute selects the
 * kind of boundary:
 * <ul>
 * <li>tone ending in "%": a major boundary is inserted, but only if <code>invalidXML</code> is false;</li>
 * <li>tone ending in "-": a boundary with that tone is inserted;</li>
 * <li>any other or no tone: a boundary without tone is inserted.</li>
 * </ul>
 *
 * @param token
 *            (current token)
 * @param tokens
 *            (list of tokens in sentence)
 * @param position
 *            (position in token list)
 * @param sentenceType
 *            (declarative, exclamative or interrogative)
 * @param specialPositionType
 *            (special position of the token, f.e. endofvorfeld or endofpar)
 * @param invalidXML
 *            (if true, major boundaries are not inserted)
 * @param firstTokenInPhrase
 *            (begin of intonation phrase)
 * @return firstTokenInPhrase, or null if a major boundary was inserted
 * @throws NumberFormatException
 *             if the "bi" attribute of a firing action is not an integer
 */
protected synchronized Element getBoundary(Element token, NodeList tokens, int position, String sentenceType,
        String specialPositionType, boolean invalidXML, Element firstTokenInPhrase) {
    final String tokenText = MaryDomUtils.tokenText(token);

    // walk over all <rule> elements below the "boundaries" section, top rule first
    final Element boundaryRules = (Element) tobiPredMap.get("boundaries");
    final DocumentTraversal traversal = (DocumentTraversal) boundaryRules.getOwnerDocument();
    final TreeWalker ruleWalker = traversal.createTreeWalker(boundaryRules, NodeFilter.SHOW_ELEMENT,
            new NameNodeFilter(new String[] { "rule" }), false);

    Element rule;
    while ((rule = (Element) ruleWalker.nextNode()) != null) {
        for (Element part = DomUtils.getFirstChildElement(rule); part != null; part = DomUtils
                .getNextSiblingElement(part)) {
            if (!part.getTagName().equals("action")) {
                // a condition part: all of them must hold for the rule to fire
                if (!checkRulePart(part, token, tokens, position, sentenceType, specialPositionType, tokenText)) {
                    break; // condition violated, try next rule
                }
                continue;
            }
            // the action part: the rule fires
            final int bi = Integer.parseInt(part.getAttribute("bi"));
            if (bi != 0) {
                final String tone = part.hasAttribute("tone") ? part.getAttribute("tone") : null;
                if (tone != null && tone.endsWith("%")) {
                    if (!invalidXML
                            && insertMajorBoundary(tokens, position, firstTokenInPhrase, tone, bi) != null) {
                        firstTokenInPhrase = null;
                    }
                } else if (tone != null && tone.endsWith("-")) {
                    insertBoundary(token, tone, bi);
                } else {
                    insertBoundary(token, null, bi);
                }
            }
            return firstTokenInPhrase;
        }
    }
    // no rule fired
    return firstTokenInPhrase;
}
// Tag-name patterns for rule parts that refer to a token at an offset X from the current token,
// f.e. "nextPlus2Text" or "previousMinus1Attributes". The code in this class applies them with
// Matcher.find(), i.e. as substring matches on the rule part's tag name.
// Text of a token following the next token:
protected static final Pattern nextPlusXTextPattern = Pattern.compile("nextPlus[0-9]+Text");
// Text of a token preceding the previous token:
protected static final Pattern previousMinusXTextPattern = Pattern.compile("previousMinus[0-9]+Text");
// MaryXML attributes of a token following the next token:
protected static final Pattern nextPlusXAttributesPattern = Pattern.compile("nextPlus[0-9]+Attributes");
// MaryXML attributes of a token preceding the previous token:
protected static final Pattern previousMinusXAttributesPattern = Pattern.compile("previousMinus[0-9]+Attributes");
/**
 * Checks the condition of a single rule part, f.e. attributes pos="NN", by dispatching on the tag name of the rule part
 * to the matching check method. Rule parts with an unknown tag name (or lacking the attribute required for their tag)
 * are treated as satisfied.
 *
 * @param currentRulePart
 *            the rule part (condition) to check
 * @param token
 *            (current token)
 * @param tokens
 *            (list of all tokens)
 * @param position
 *            (position in token list)
 * @param sentenceType
 *            (declarative, exclamative or interrogative)
 * @param specialPositionType
 *            (special position in sentence(end of vorfeld) or text(end of paragraph))
 * @param tokenText
 *            (text of token)
 * @return true if condition is satisfied
 */
protected boolean checkRulePart(Element currentRulePart, Element token, NodeList tokens, int position, String sentenceType,
        String specialPositionType, String tokenText) {
    final String tagName = currentRulePart.getTagName();
    // text of the current token
    if (tagName.equals("text") && currentRulePart.hasAttribute("word")) {
        return checkText(currentRulePart, tokenText);
    }
    // text of following+X token or preceding-X token
    if (currentRulePart.hasAttribute("word")
            && (tagName.equals("nextText") || nextPlusXTextPattern.matcher(tagName).find()
                    || tagName.equals("previousText") || previousMinusXTextPattern.matcher(tagName).find())) {
        return checkTextOfOtherToken(tagName, currentRulePart, position, tokens);
    }
    // number of following tokens
    if (tagName.equals("folTokens") && currentRulePart.hasAttribute("num")) {
        return checkFolTokens(currentRulePart, position, tokens);
    }
    // number of preceding tokens
    if (tagName.equals("prevTokens") && currentRulePart.hasAttribute("num")) {
        return checkPrevTokens(currentRulePart, position, tokens);
    }
    // number of following words
    if (tagName.equals("folWords") && currentRulePart.hasAttribute("num")) {
        return checkFolWords(currentRulePart, position, tokens);
    }
    // number of preceding words
    if (tagName.equals("prevWords") && currentRulePart.hasAttribute("num")) {
        return checkPrevWords(currentRulePart, position, tokens);
    }
    // sentence type (f.e. declarative sentence)
    if (tagName.equals("sentence") && currentRulePart.hasAttribute("type")) {
        return checkSentence(currentRulePart, sentenceType);
    }
    // special position of token in sentence/text (endofvorfeld, endofpar)
    if (tagName.equals("specialPosition") && currentRulePart.hasAttribute("type")) {
        return checkSpecialPosition(currentRulePart, specialPositionType);
    }
    // MaryXML attribute values of the current token
    if (tagName.equals("attributes")) {
        return checkAttributes(currentRulePart, token);
    }
    // MaryXML attribute values of a neighbouring token
    if (tagName.equals("nextAttributes") || nextPlusXAttributesPattern.matcher(tagName).find()
            || tagName.equals("previousAttributes") || previousMinusXAttributesPattern.matcher(tagName).find()) {
        return checkAttributesOfOtherToken(tagName, currentRulePart, position, tokens);
    }
    // unknown rules always match
    return true;
}
/**
 * Checks a rule part with tag "text" (or one of the neighbour-text tags) against a token text. Only the "word" attribute
 * is evaluated. Its value is either a plain word, a word negated with a leading "!", or a list reference (starting with
 * INLIST, INFSTLIST, !INLIST or !INFSTLIST), which is delegated to
 * {@link #checkList(String, String)}. A rule part without "word" attribute is satisfied.
 *
 * @param currentRulePart
 *            the rule part carrying the "word" attribute
 * @param tokenText
 *            text of the token to compare with
 * @return true if the token text fulfils the "word" condition
 */
protected boolean checkText(Element currentRulePart, String tokenText) {
    if (!currentRulePart.hasAttribute("word")) {
        return true; // there is only the "word" attribute right now
    }
    final String wordCondition = currentRulePart.getAttribute("word");
    final boolean refersToList = wordCondition.startsWith("INLIST") || wordCondition.startsWith("INFSTLIST")
            || wordCondition.startsWith("!INLIST") || wordCondition.startsWith("!INFSTLIST");
    if (refersToList) {
        return checkList(wordCondition, tokenText);
    }
    if (wordCondition.startsWith("!")) {
        // negation: the token text must differ from the word after the "!"
        return !tokenText.equals(wordCondition.substring(1));
    }
    return tokenText.equals(wordCondition);
}
/**
 * Checks a rule part with tag "nextText", "previousText", "nextPlusXText" or "previousMinusXText": the text of the
 * addressed neighbour token is compared with the "word" attribute of the rule part. "nextText"/"previousText" address
 * the direct neighbours (position + 1 / position - 1); "nextPlusXText"/"previousMinusXText" address the token X
 * positions further away (position + 1 + X / position - 1 - X).
 *
 * @param tag
 *            tag name of the rule part
 * @param currentRulePart
 *            the rule part carrying the "word" attribute
 * @param position
 *            position of the current token in <code>tokens</code>
 * @param tokens
 *            list of all tokens
 * @return false if the addressed token does not exist, otherwise the result of
 *         {@link #checkText(Element, String)} for that token's text
 */
protected boolean checkTextOfOtherToken(String tag, Element currentRulePart, int position, NodeList tokens) {
    Element referenced = null;
    // text of next token
    if (tag.equals("nextText") && position < tokens.getLength() - 1) {
        referenced = (Element) tokens.item(position + 1);
    }
    // text of some other token following the next token
    if (nextPlusXTextPattern.matcher(tag).find()) {
        final int extra = Integer.parseInt(tag.replaceAll("nextPlus", "").replaceAll("Text", ""));
        if (position < tokens.getLength() - (extra + 1)) {
            referenced = (Element) tokens.item(position + 1 + extra);
        }
    }
    // text of previous token
    if (tag.equals("previousText") && position > 0) {
        referenced = (Element) tokens.item(position - 1);
    }
    // text of some other token preceding the previous token
    if (previousMinusXTextPattern.matcher(tag).find()) {
        final int extra = Integer.parseInt(tag.replaceAll("previousMinus", "").replaceAll("Text", ""));
        if (position > extra) {
            referenced = (Element) tokens.item(position - (extra + 1));
        }
    }
    return referenced != null && checkText(currentRulePart, MaryDomUtils.tokenText(referenced));
}
/**
 * Checks a rule part with tag "folTokens"; there is only the "num" attribute right now. The number of tokens following
 * the current token is compared with the value of the num attribute; f.e. the value "3+" means: at least 3 following
 * tokens, "3-": not more than 3, "3": exactly 3. The number may have several digits (f.e. "12+"). A rule part without
 * "num" attribute, or with an unknown suffix after the number, is treated as satisfied.
 *
 * @param currentRulePart
 *            the rule part carrying the "num" attribute
 * @param position
 *            position of the current token in <code>tokens</code>
 * @param tokens
 *            list of all tokens
 * @return true if the condition is satisfied
 * @throws NumberFormatException
 *             if the value of "num" does not start with a number
 */
protected boolean checkFolTokens(Element currentRulePart, int position, NodeList tokens) {
    if (!currentRulePart.hasAttribute("num")) {
        return true; // there is only the "num" attribute right now
    }
    final String spec = currentRulePart.getAttribute("num");
    // split the value into its leading number and the optional "+" / "-" suffix
    int digitsEnd = 0;
    while (digitsEnd < spec.length() && spec.charAt(digitsEnd) >= '0' && spec.charAt(digitsEnd) <= '9') {
        digitsEnd++;
    }
    final int required = Integer.parseInt(spec.substring(0, digitsEnd));
    final String suffix = spec.substring(digitsEnd);
    final int following = tokens.getLength() - 1 - position;
    if (suffix.isEmpty()) { // rule requires exactly num tokens after current token
        return following == required;
    }
    if (suffix.startsWith("+")) { // rule requires at least num tokens after current token
        return following >= required;
    }
    if (suffix.startsWith("-")) { // rule requires not more than num tokens after current token
        return following <= required;
    }
    return true; // unknown suffix: condition is not enforced
}
/**
 * Checks a rule part with tag "prevTokens"; there is only the "num" attribute right now. The number of tokens preceding
 * the current token (which equals <code>position</code>) is compared with the value of the num attribute; f.e. the
 * value "3+" means: at least 3 preceding tokens, "3-": not more than 3, "3": exactly 3. The number may have several
 * digits (f.e. "12+"). A rule part without "num" attribute, or with an unknown suffix after the number, is treated as
 * satisfied.
 *
 * @param currentRulePart
 *            the rule part carrying the "num" attribute
 * @param position
 *            position of the current token in <code>tokens</code>
 * @param tokens
 *            list of all tokens (not inspected by this check)
 * @return true if the condition is satisfied
 * @throws NumberFormatException
 *             if the value of "num" does not start with a number
 */
protected boolean checkPrevTokens(Element currentRulePart, int position, NodeList tokens) {
    if (!currentRulePart.hasAttribute("num")) {
        return true; // there is only the "num" attribute right now
    }
    final String spec = currentRulePart.getAttribute("num");
    // split the value into its leading number and the optional "+" / "-" suffix
    int digitsEnd = 0;
    while (digitsEnd < spec.length() && spec.charAt(digitsEnd) >= '0' && spec.charAt(digitsEnd) <= '9') {
        digitsEnd++;
    }
    final int required = Integer.parseInt(spec.substring(0, digitsEnd));
    final String suffix = spec.substring(digitsEnd);
    // the tokens at indices 0 .. position-1 precede the current token
    final int preceding = position;
    if (suffix.isEmpty()) { // rule requires exactly num tokens preceding current token
        return preceding == required;
    }
    if (suffix.startsWith("+")) { // rule requires at least num tokens preceding current token
        return preceding >= required;
    }
    if (suffix.startsWith("-")) { // rule requires not more than num tokens preceding current token
        return preceding <= required;
    }
    return true; // unknown suffix: condition is not enforced
}
/**
 * Checks a rule part with tag "folWords"; there is only the "num" attribute right now. The number of words following the
 * current token is compared with the value of the num attribute; f.e. the value "3+" means: at least 3 following words,
 * "3-": not more than 3, "3": exactly 3. A token counts as a word if its "ph" attribute is not empty. The number may
 * have several digits (f.e. "12+"). A rule part without "num" attribute, or with an unknown suffix after the number, is
 * treated as satisfied.
 *
 * @param currentRulePart
 *            the rule part carrying the "num" attribute
 * @param position
 *            position of the current token in <code>tokens</code>
 * @param tokens
 *            list of all tokens
 * @return true if the condition is satisfied
 * @throws NumberFormatException
 *             if the value of "num" does not start with a number
 */
protected boolean checkFolWords(Element currentRulePart, int position, NodeList tokens) {
    if (!currentRulePart.hasAttribute("num")) {
        return true; // there is only the "num" attribute right now
    }
    final String spec = currentRulePart.getAttribute("num");
    // split the value into its leading number and the optional "+" / "-" suffix
    int digitsEnd = 0;
    while (digitsEnd < spec.length() && spec.charAt(digitsEnd) >= '0' && spec.charAt(digitsEnd) <= '9') {
        digitsEnd++;
    }
    final int required = Integer.parseInt(spec.substring(0, digitsEnd));
    final String suffix = spec.substring(digitsEnd);
    // count the following tokens that have a non-empty "ph" attribute
    int following = 0;
    for (int i = position + 1; i < tokens.getLength(); i++) {
        if (!((Element) tokens.item(i)).getAttribute("ph").equals("")) {
            following++;
        }
    }
    if (suffix.isEmpty()) { // rule requires exactly num words after current token
        return following == required;
    }
    if (suffix.startsWith("+")) { // rule requires at least num words after current token
        return following >= required;
    }
    if (suffix.startsWith("-")) { // rule requires not more than num words after current token
        return following <= required;
    }
    return true; // unknown suffix: condition is not enforced
}
/**
 * Checks a rule part with tag "prevWords"; there is only the "num" attribute right now. The number of words preceding
 * the current token is compared with the value of the num attribute; f.e. the value "3+" means: at least 3 preceding
 * words, "3-": not more than 3, "3": exactly 3. A token counts as a word if its "ph" attribute is not empty. The number
 * may have several digits (f.e. "12+"). A rule part without "num" attribute, or with an unknown suffix after the
 * number, is treated as satisfied.
 *
 * @param currentRulePart
 *            the rule part carrying the "num" attribute
 * @param position
 *            position of the current token in <code>tokens</code>
 * @param tokens
 *            list of all tokens
 * @return true if the condition is satisfied
 * @throws NumberFormatException
 *             if the value of "num" does not start with a number
 */
protected boolean checkPrevWords(Element currentRulePart, int position, NodeList tokens) {
    if (!currentRulePart.hasAttribute("num")) {
        return true; // there is only the "num" attribute right now
    }
    final String spec = currentRulePart.getAttribute("num");
    // split the value into its leading number and the optional "+" / "-" suffix
    int digitsEnd = 0;
    while (digitsEnd < spec.length() && spec.charAt(digitsEnd) >= '0' && spec.charAt(digitsEnd) <= '9') {
        digitsEnd++;
    }
    final int required = Integer.parseInt(spec.substring(0, digitsEnd));
    final String suffix = spec.substring(digitsEnd);
    // count the preceding tokens that have a non-empty "ph" attribute
    int preceding = 0;
    for (int i = position - 1; i >= 0; i--) {
        if (!((Element) tokens.item(i)).getAttribute("ph").equals("")) {
            preceding++;
        }
    }
    if (suffix.isEmpty()) { // rule requires exactly num words preceding current token
        return preceding == required;
    }
    if (suffix.startsWith("+")) { // rule requires at least num words preceding current token
        return preceding >= required;
    }
    if (suffix.startsWith("-")) { // rule requires not more than num words preceding current token
        return preceding <= required;
    }
    return true; // unknown suffix: condition is not enforced
}
/**
 * Checks a rule part with tag "sentence"; there is only the "type" attribute right now. The condition holds if the
 * sentence type equals the value of the type attribute, or, for a value negated with a leading "!", if it differs from
 * the value after the "!". A rule part without "type" attribute is satisfied.
 *
 * @param currentRulePart
 *            the rule part carrying the "type" attribute
 * @param sentenceType
 *            type of the current sentence
 * @return true if the condition is satisfied
 */
protected boolean checkSentence(Element currentRulePart, String sentenceType) {
    if (!currentRulePart.hasAttribute("type")) {
        return true; // there is only the "type" attribute right now
    }
    final String requiredType = currentRulePart.getAttribute("type");
    if (requiredType.startsWith("!")) {
        // negation: any sentence type but the one named after the "!"
        return !sentenceType.equals(requiredType.substring(1));
    }
    return sentenceType.equals(requiredType);
}
/**
 * Checks a rule part with tag "specialPosition"; there is only the "type" attribute right now. The condition holds if
 * the special position of the token equals the value of the type attribute, or, for a value negated with a leading "!",
 * if it differs from the value after the "!"; values: endofvorfeld, endofpar (end of paragraph). A rule part without
 * "type" attribute is satisfied.
 *
 * @param currentRulePart
 *            the rule part carrying the "type" attribute
 * @param specialPositionType
 *            special position of the current token
 * @return true if the condition is satisfied
 */
protected boolean checkSpecialPosition(Element currentRulePart, String specialPositionType) {
    if (!currentRulePart.hasAttribute("type")) {
        return true; // there is only the "type" attribute right now
    }
    final String condition = currentRulePart.getAttribute("type");
    final boolean negated = condition.startsWith("!");
    final String position = negated ? condition.substring(1) : condition;
    final boolean samePosition = specialPositionType.equals(position);
    return negated ? !samePosition : samePosition;
}
/**
 * Checks a rule part with tag "prosodicPosition"; there is only the "type" attribute right now. The condition holds if
 * the prosodic position of the token equals the value of the type attribute, or, for a value negated with a leading
 * "!", if it differs from the value after the "!"; values: prenuclear, nuclearParagraphFinal,
 * nuclearNonParagraphFinal, postnuclear. A rule part without "type" attribute is satisfied.
 *
 * @param currentRulePart
 *            the rule part carrying the "type" attribute
 * @param prosodicPositionType
 *            prosodic position of the current token
 * @return true if the condition is satisfied
 */
protected boolean checkProsodicPosition(Element currentRulePart, String prosodicPositionType) {
    if (currentRulePart.hasAttribute("type")) { // there is only the "type" attribute right now
        final String typeValue = currentRulePart.getAttribute("type");
        if (typeValue.startsWith("!")) {
            // negation: the position must not be the one named after the "!"
            if (prosodicPositionType.equals(typeValue.substring(1))) {
                return false;
            }
        } else if (!prosodicPositionType.equals(typeValue)) {
            return false;
        }
    }
    return true;
}
/**
 * Checks a rule part with tag "attributes" (or one of the neighbour-attributes tags): every attribute of the rule part
 * is a condition on the MaryXML attribute of the same name on the token, and all conditions must hold. The value of a
 * rule attribute can be:
 * <ul>
 * <li>"!": the token must not have the attribute;</li>
 * <li>"": the token must have the attribute, the value does not matter;</li>
 * <li>a list reference (starting with INLIST, INFSTLIST, !INLIST or !INFSTLIST): evaluated by
 * {@link #checkList(String, String)} on the token's attribute value;</li>
 * <li>"!value": the token must have the attribute with a value different from "value";</li>
 * <li>"value": the token must have the attribute with exactly that value.</li>
 * </ul>
 * The order of attributes in a NamedNodeMap is unspecified, so each condition is evaluated independently and the result
 * does not depend on that order.
 *
 * @param currentRulePart
 *            the rule part whose attributes are the conditions
 * @param token
 *            the token to check; may be null (token doesn't exist)
 * @return true if the token exists and fulfils all conditions
 */
protected boolean checkAttributes(Element currentRulePart, Element token) {
    if (token == null) {
        return false; // token doesn't exist
    }
    final NamedNodeMap conditions = currentRulePart.getAttributes();
    for (int z = 0; z < conditions.getLength(); z++) { // loop over MaryXML attributes in rule part
        final Node condition = conditions.item(z);
        final String attName = condition.getNodeName();
        String expected = condition.getNodeValue();
        // first the special cases
        if (!token.hasAttribute(attName)) {
            if (expected.equals("!")) {
                continue; // rule says that token shouldn't have it --> satisfied, check the rest
            }
            return false; // rule says that token should have it
        }
        // token has attribute ...
        if (expected.equals("!")) {
            return false; // ... but rule says that token shouldn't have it
        }
        if (expected.equals("")) {
            continue; // value doesn't matter, attribute is present --> satisfied, check the rest
        }
        final String actual = token.getAttribute(attName);
        final boolean refersToList = expected.startsWith("INLIST") || expected.startsWith("INFSTLIST")
                || expected.startsWith("!INLIST") || expected.startsWith("!INFSTLIST");
        if (refersToList) {
            if (!checkList(expected, actual)) {
                return false;
            }
        } else if (expected.startsWith("!")) {
            // value is negated --> token shouldn't have the value after the "!"
            expected = expected.substring(1);
            if (actual.equals(expected)) {
                return false;
            }
        } else if (!actual.equals(expected)) {
            return false;
        }
    }
    return true;
}
/**
 * Checks a rule part with tag "nextAttributes", "previousAttributes", "nextPlusXAttributes" or
 * "previousMinusXAttributes": the MaryXML attributes of the addressed neighbour token are checked against the rule
 * part. "nextAttributes"/"previousAttributes" address the direct neighbours (position + 1 / position - 1);
 * "nextPlusXAttributes"/"previousMinusXAttributes" address the token X positions further away (position + 1 + X /
 * position - 1 - X).
 *
 * @param tag
 *            tag name of the rule part
 * @param currentRulePart
 *            the rule part whose attributes are the conditions
 * @param position
 *            position of the current token in <code>tokens</code>
 * @param tokens
 *            list of all tokens
 * @return the result of {@link #checkAttributes(Element, Element)} for the addressed token; that method receives null
 *         if the addressed token does not exist
 */
protected boolean checkAttributesOfOtherToken(String tag, Element currentRulePart, int position, NodeList tokens) {
    Element referenced = null;
    // MaryXML attributes of next token
    if (tag.equals("nextAttributes") && position < tokens.getLength() - 1) {
        referenced = (Element) tokens.item(position + 1);
    }
    // MaryXML attributes of some token following the next token
    if (nextPlusXAttributesPattern.matcher(tag).find()) {
        final int extra = Integer.parseInt(tag.replaceAll("nextPlus", "").replaceAll("Attributes", ""));
        if (position < tokens.getLength() - (extra + 1)) {
            referenced = (Element) tokens.item(position + 1 + extra);
        }
    }
    // MaryXML attributes of previous token
    if (tag.equals("previousAttributes") && position > 0) {
        referenced = (Element) tokens.item(position - 1);
    }
    // MaryXML attributes of some token preceding the previous token
    if (previousMinusXAttributesPattern.matcher(tag).find()) {
        final int extra = Integer.parseInt(tag.replaceAll("previousMinus", "").replaceAll("Attributes", ""));
        if (position > extra) {
            referenced = (Element) tokens.item(position - (extra + 1));
        }
    }
    return checkAttributes(currentRulePart, referenced);
}
/**
 * Checks if tokenValue is contained in a list. This base implementation is able to deal with list types represented as
 * Sets; subclasses may override this method to be able to deal with different list representations.
 *
 * @param currentVal
 *            the condition to check; can be either <code>INLIST:</code> or <code>!INLIST:</code> followed by the list
 *            name to check.
 * @param tokenValue
 *            value to look up in the list
 * @return for <code>INLIST:</code>, whether tokenValue is contained in the list; for <code>!INLIST:</code>, whether it
 *         is not contained. If no list of that name is known, false in both cases.
 * @throws NullPointerException
 *             if an argument is null
 * @throws IllegalArgumentException
 *             if currentVal does not start with INLIST or !INLIST, or if the list is not represented as a Set
 */
protected boolean checkList(String currentVal, String tokenValue) {
    if (currentVal == null || tokenValue == null) {
        throw new NullPointerException("Received null argument");
    }
    final boolean negated = currentVal.startsWith("!INLIST");
    if (!negated && !currentVal.startsWith("INLIST")) {
        throw new IllegalArgumentException("currentVal does not start with INLIST or !INLIST");
    }
    // the list name is everything after the first colon
    final String listName = currentVal.substring(currentVal.indexOf(':') + 1);
    final Object list = listMap.get(listName);
    if (list == null) {
        return false; // no list found
    }
    if (!(list instanceof Set)) {
        throw new IllegalArgumentException("Unknown list representation: " + list);
    }
    final boolean found = ((Set<?>) list).contains(tokenValue);
    // satisfied if (found and not negated) or (not found and negated)
    return found != negated;
}
/**
 * Determines the sentence type; values: decl, excl, interrog, interrogYN or interrogW.
 * <p>
 * The tokens are searched from the end for the last token whose text is ".", "!" or "?", giving "decl", "excl" or
 * "interrog"; without such a token the type is "decl". For "interrog", the part of speech of the first word (first
 * token with a non-empty "ph" attribute, the last token excluded) is looked up in the lists "firstPosInQuestionYN" and
 * "firstPosInQuestionW" to refine the type to "interrogYN" or "interrogW"; if it is in both lists, "interrogW" wins.
 *
 * @param tokens
 *            the tokens of the sentence
 * @return the sentence type
 */
protected String getSentenceType(NodeList tokens) {
    String type = "decl";
    // search for sentence finishing punctuation mark, starting at the end
    for (int idx = tokens.getLength() - 1; idx >= 0; idx--) {
        final String text = MaryDomUtils.tokenText((Element) tokens.item(idx));
        final boolean isPeriod = text.equals(".");
        if (isPeriod || text.equals("!") || text.equals("?")) {
            type = isPeriod ? "decl" : (text.equals("!") ? "excl" : "interrog");
            break;
        }
    }
    if (!type.equals("interrog")) {
        return type;
    }
    // search for the first word in sentence
    final int lastIdx = tokens.getLength() - 1;
    for (int idx = 0; idx < lastIdx; idx++) {
        final Element candidate = (Element) tokens.item(idx);
        if (candidate.getAttribute("ph").equals("")) {
            continue; // not a word
        }
        // possible parts of speech of the first word in a yes-no question
        final Set<?> yesNoPos = (Set<?>) listMap.get("firstPosInQuestionYN");
        // possible parts of speech of the first word in a wh-question
        final Set<?> whPos = (Set<?>) listMap.get("firstPosInQuestionW");
        final String posFirstWord = candidate.getAttribute("pos");
        if (whPos != null && whPos.contains(posFirstWord)) {
            return "interrogW";
        }
        if (yesNoPos != null && yesNoPos.contains(posFirstWord)) {
            return "interrogYN";
        }
        return type; // only the first word is considered
    }
    return type;
}
/**
 * Assigns an accent to the given token by setting its "accent" attribute.
 *
 * @param token
 *            a token element
 * @param accent
 *            the accent string to store in the "accent" attribute
 */
protected void setAccent(Element token, String accent) {
    final String accentAttribute = "accent";
    token.setAttribute(accentAttribute, accent);
}
/**
 * Insert a boundary after token, with the given tone and breakindex. If a boundary element already exists after token
 * (but before the following token), it is reused, if both token and boundary have the same parent node. In addition, if
 * token is punctuation, a boundary preceding token can be reused, if both have the same parent node.
 * <p>
 * When a boundary is reused, existing concrete values are kept: the tone is set only if the boundary has no tone yet,
 * or if its tone is "unknown" and the given tone is not; the breakindex is set only if bi is positive and the boundary
 * has no breakindex or the breakindex "unknown". An existing numeric breakindex is never replaced.
 * <p>
 * When a new boundary is created and token is inside an mtu element, the boundary is placed after the highest-level
 * mtu, provided token is the last token in it; for a token in the middle of an mtu no boundary is inserted.
 *
 * @param token
 *            the token after which the boundary is to be inserted
 * @param tone
 *            the tone of the boundary; may be null (no tone)
 * @param bi
 *            the breakindex; only positive values are written
 * @return the boundary element on success, null on failure.
 */
protected Element insertBoundary(Element token, String tone, int bi) {
    logger.debug("insertBoundary: after token `" + MaryDomUtils.tokenText(token) + "', tone " + tone + ", bi " + bi);
    final Document ownerDoc = token.getOwnerDocument();

    // Search for an existing boundary next to token, looking only at boundaries and tokens of the sentence
    final Node sentenceRoot = DomUtils.getAncestor(token, MaryXML.SENTENCE);
    final TreeWalker walker = ((DocumentTraversal) ownerDoc).createTreeWalker(sentenceRoot, NodeFilter.SHOW_ELEMENT,
            new NameNodeFilter(new String[] { MaryXML.BOUNDARY, MaryXML.TOKEN }), false);
    Element existing = null;
    walker.setCurrentNode(token);
    final Element following = (Element) walker.nextNode();
    if (following != null && following.getTagName().equals(MaryXML.BOUNDARY)) {
        existing = following;
    } else if (isPunctuation(token)) {
        // if the current token is punctuation, we also look for a
        // boundary before the current token
        walker.setCurrentNode(token);
        final Element preceding = (Element) walker.previousNode();
        if (preceding != null && preceding.getTagName().equals(MaryXML.BOUNDARY)) {
            existing = preceding;
        }
    }

    // Reuse a boundary tag if it has the same parent as the token
    if (existing != null && existing.getParentNode().equals(token.getParentNode())) {
        if (tone != null) {
            final String currentTone = existing.getAttribute("tone");
            final boolean toneMissing = currentTone.equals("");
            final boolean replacesUnknown = !tone.equals("unknown") && currentTone.equals("unknown");
            if (toneMissing || replacesUnknown) {
                existing.setAttribute("tone", tone);
            }
        }
        if (bi > 0) {
            final String currentBreakindex = existing.getAttribute("breakindex");
            // only fill in a missing or unknown breakindex; an existing number is kept
            if (currentBreakindex.equals("") || currentBreakindex.equals("unknown")) {
                existing.setAttribute("breakindex", String.valueOf(bi));
            }
        }
        return existing;
    }

    // No reusable boundary tag, introduce one.
    // First verify that we have a valid parent element
    final Node parent = token.getParentNode();
    if (parent == null) {
        return null;
    }
    // Make sure not to insert the new boundary in the middle of an mtu element:
    Element container = (Element) parent;
    Element insertionPoint = MaryDomUtils.getNextSiblingElement(token);
    final Element mtu = (Element) MaryDomUtils.getHighestLevelAncestor(token, MaryXML.MTU);
    if (mtu != null) {
        if (!MaryDomUtils.isLastOfItsKindIn(token, mtu)) {
            // token is in the middle of an mtu - don't insert boundary
            return null;
        }
        // token is the last one in the mtu: insert after the mtu
        container = (Element) mtu.getParentNode();
        insertionPoint = MaryDomUtils.getNextSiblingElement(mtu);
    }
    final Element created = MaryXML.createElement(ownerDoc, MaryXML.BOUNDARY);
    if (tone != null) {
        created.setAttribute("tone", tone);
    }
    if (bi > 0) {
        created.setAttribute("breakindex", String.valueOf(bi));
    }
    container.insertBefore(created, insertionPoint);
    return created;
}
/**
 * Insert a major boundary after token number <code>i</code> in <code>tokens</code>.
 * <p>
 * Also inserts a phrase tag enclosing the nodes from <code>firstToken</code> up to the boundary. If the boundary could
 * not be inserted (insertBoundary returned null), no phrase tag is inserted, as there is no end point for it.
 *
 * @param tokens
 *            list of all tokens
 * @param i
 *            index in <code>tokens</code> of the token after which the boundary is inserted
 * @param firstToken
 *            first token of the phrase that ends at the new boundary
 * @param tone
 *            tone of the boundary
 * @param breakindex
 *            breakindex of the boundary
 * @return The boundary element, or null if no boundary could be inserted.
 */
protected Element insertMajorBoundary(NodeList tokens, int i, Element firstToken, String tone, int breakindex) {
    Element boundary = insertBoundary((Element) tokens.item(i), tone, breakindex);
    if (boundary != null) {
        // only enclose a phrase if there is a boundary to end it with
        insertPhraseNode(firstToken, boundary);
    }
    return boundary;
}
/**
 * Insert a phrase element, enclosing the first and last element, into the tree. Typically the first element would be a
 * token and the last element a boundary.
 * <p>
 * If the previous sibling element of {@code first} is a boundary, the phrase starts at that boundary instead; likewise,
 * if the next sibling element of {@code last} is a boundary, the phrase extends to include it.
 *
 * @param first
 *            the first element to enclose
 * @param last
 *            the last element to enclose
 * @return true on success, false on failure (which includes either argument being {@code null}).
 */
protected boolean insertPhraseNode(Element first, Element last) {
    // Both end points are needed; last may legitimately be null when the caller
    // could not obtain a boundary element. Report that as a failure.
    if (first == null || last == null) {
        return false;
    }
    // Allow for the exotic case that a phrase should start with a boundary element:
    Element encloseFromHere = first;
    Element maybeBoundary = DomUtils.getPreviousSiblingElement(first);
    if (maybeBoundary != null && maybeBoundary.getTagName().equals(MaryXML.BOUNDARY)) {
        encloseFromHere = maybeBoundary;
    }
    // Take an existing trailing boundary element into the new phrase:
    Element encloseToHere = last;
    maybeBoundary = DomUtils.getNextSiblingElement(last);
    if (maybeBoundary != null && maybeBoundary.getTagName().equals(MaryXML.BOUNDARY)) {
        encloseToHere = maybeBoundary;
    }
    Element phrase = MaryDomUtils.encloseNodesWithNewElement(encloseFromHere, encloseToHere, MaryXML.PHRASE);
    // A null result from the helper is reported to the caller as failure.
    return phrase != null;
}
/**
 * Decide whether intonation rules may be applied to the given node. Rules are switched off only when the prosody
 * ancestor found via {@code MaryDomUtils.getAncestor} carries the attribute {@code rules="off"}.
 *
 * @param n
 *            the node to check
 * @return {@code true} if rules are to be applied, {@code false} otherwise.
 */
protected boolean applyRules(Node n) {
    Element prosody = (Element) MaryDomUtils.getAncestor(n, MaryXML.PROSODY);
    if (prosody == null) {
        // No prosody ancestor was found, so nothing can veto the rules.
        return true;
    }
    String rulesSetting = prosody.getAttribute("rules");
    return !rulesSetting.equals("off");
}
/**
 * Go through all tokens in a document, and copy each token's {@code accent} attribute to the first syllable below that
 * token which has {@code stress="1"}. If the token has no such syllable, the accent goes to the token's first syllable
 * instead; a token without any syllable is left untouched.
 *
 * @param doc
 *            the document to process; it is cast to {@code DocumentTraversal} to create the iterators
 */
protected void copyAccentsToSyllables(Document doc) {
    DocumentTraversal traversal = (DocumentTraversal) doc;
    NodeIterator tokenIt = traversal.createNodeIterator(doc, NodeFilter.SHOW_ELEMENT,
            new NameNodeFilter(MaryXML.TOKEN), false);
    try {
        Element token;
        while ((token = (Element) tokenIt.nextNode()) != null) {
            if (!token.hasAttribute("accent")) {
                continue;
            }
            String accent = token.getAttribute("accent");
            // Look for the first syllable marked as stressed:
            Element target = null;
            NodeIterator sylIt = traversal.createNodeIterator(token, NodeFilter.SHOW_ELEMENT,
                    new NameNodeFilter(MaryXML.SYLLABLE), false);
            try {
                Element syl;
                while ((syl = (Element) sylIt.nextNode()) != null) {
                    if (syl.getAttribute("stress").equals("1")) {
                        target = syl;
                        break; // done for this token
                    }
                }
            } finally {
                // One iterator is created per accented token; release each one
                // as soon as it is no longer needed.
                sylIt.detach();
            }
            if (target == null) {
                // This token does not have a stressed syllable --
                // take the first syllable then:
                target = MaryDomUtils.getFirstElementByTagName(token, MaryXML.SYLLABLE);
            }
            if (target != null) {
                target.setAttribute("accent", accent);
            }
        }
    } finally {
        tokenIt.detach();
    }
}
/**
 * Check whether {@code token} is enclosed by a {@code <prosody>} element containing an attribute
 * {@code force-accent}.
 *
 * @param token
 *            the element whose ancestors are searched
 * @return the value of the {@code force-accent} attribute, if one exists, or the empty string otherwise.
 */
protected String getForceAccent(Element token) {
    // Ask for the closest ancestor element that carries a "force-accent" attribute.
    Element enclosing = MaryDomUtils.getClosestAncestorWithAttribute(token, MaryXML.PROSODY, "force-accent");
    return enclosing == null ? "" : enclosing.getAttribute("force-accent");
}
/**
 * Verify whether a given token is a punctuation mark, i.e. whether its text is exactly one of
 * {@code , . ? ! : ;}.
 *
 * @param token
 *            the token element to be tested.
 * @return true if token is a punctuation, false otherwise.
 * @throws NullPointerException
 *             if {@code token} is null
 * @throws IllegalArgumentException
 *             if {@code token} is not a token element
 */
protected boolean isPunctuation(Element token) {
    if (token == null) {
        throw new NullPointerException("Received null token");
    }
    String tagName = token.getTagName();
    if (!tagName.equals(MaryXML.TOKEN)) {
        throw new IllegalArgumentException("Expected <" + MaryXML.TOKEN + "> element, got <" + tagName + ">");
    }
    String text = MaryDomUtils.tokenText(token);
    // All recognised punctuation marks are single characters, so a length check
    // plus a lookup in the set of marks is equivalent to comparing against each one.
    return text.length() == 1 && ",.?!:;".indexOf(text.charAt(0)) >= 0;
}
}