// marytts.language.fr.Preprocess - Maven / Gradle / Ivy
package marytts.language.fr;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.ibm.icu.text.RuleBasedNumberFormat;
import com.ibm.icu.util.ULocale;

import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.modules.InternalModule;
import marytts.util.dom.MaryDomUtils;
import marytts.util.dom.NameNodeFilter;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.TreeWalker;
/**
* @author Tristan Hamilton
*
* Processes cardinal and ordinal numbers.
*/
public class Preprocess extends InternalModule {
private RuleBasedNumberFormat rbnf;
protected final String cardinalRule;
protected final String ordinalRule;
public Preprocess() {
super("Preprocess", MaryDataType.TOKENS, MaryDataType.WORDS, Locale.FRENCH);
this.rbnf = new RuleBasedNumberFormat(ULocale.FRENCH, RuleBasedNumberFormat.SPELLOUT);
this.cardinalRule = "%spellout-numbering";
this.ordinalRule = getOrdinalRuleName(rbnf);
}
public MaryData process(MaryData d) throws Exception {
Document doc = d.getDocument();
checkForNumbers(doc);
MaryData result = new MaryData(getOutputType(), d.getLocale());
result.setDocument(doc);
return result;
}
protected void checkForNumbers(Document doc) {
TreeWalker tw = ((DocumentTraversal) doc).createTreeWalker(doc, NodeFilter.SHOW_ELEMENT,
new NameNodeFilter(MaryXML.TOKEN), false);
Element t = null;
while ((t = (Element) tw.nextNode()) != null) {
if (MaryDomUtils.hasAncestor(t, MaryXML.SAYAS) || t.hasAttribute("ph") || t.hasAttribute("sounds_like")) {
// ignore token
continue;
}
String origText = MaryDomUtils.tokenText(t);
if (MaryDomUtils.tokenText(t).matches("\\d+(e|er|re|ère|ème)")) {
String matched = MaryDomUtils.tokenText(t).split("e|ere|er|re|ère|ème")[0];
if (matched.equals("1")) {
if (MaryDomUtils.tokenText(t).matches("\\d+er")) {
MaryDomUtils.setTokenText(t, expandOrdinal(Double.parseDouble(matched)));
} else {
String s = expandOrdinal(Double.parseDouble(matched));
MaryDomUtils.setTokenText(t, s.replace("ier", "ière"));
}
} else {
MaryDomUtils.setTokenText(t, expandOrdinal(Double.parseDouble(matched)));
}
} else if (MaryDomUtils.tokenText(t).matches("\\d+")) {
MaryDomUtils.setTokenText(t, expandNumber(Double.parseDouble(MaryDomUtils.tokenText(t))));
}
// if token isn't ignored but there is no handling rule don't add MTU
if (!origText.equals(MaryDomUtils.tokenText(t))) {
MaryDomUtils.encloseWithMTU(t, origText, null);
}
}
}
protected String expandNumber(double number) {
this.rbnf.setDefaultRuleSet(cardinalRule);
return this.rbnf.format(number);
}
protected String expandOrdinal(double number) {
this.rbnf.setDefaultRuleSet(ordinalRule);
return this.rbnf.format(number);
}
/**
* Try to extract the rule name for "expand ordinal" from the given RuleBasedNumberFormat.
*
* The rule name is locale sensitive, but usually starts with "%spellout-ordinal".
*
* @param rbnf
* The RuleBasedNumberFormat from where we will try to extract the rule name.
* @return The rule name for "ordinal spell out".
*/
protected static String getOrdinalRuleName(final RuleBasedNumberFormat rbnf) {
List l = Arrays.asList(rbnf.getRuleSetNames());
if (l.contains("%spellout-ordinal")) {
return "%spellout-ordinal";
} else if (l.contains("%spellout-ordinal-masculine")) {
return "%spellout-ordinal-masculine";
} else {
for (String string : l) {
if (string.startsWith("%spellout-ordinal")) {
return string;
}
}
}
throw new UnsupportedOperationException("The locale " + rbnf.getLocale(ULocale.ACTUAL_LOCALE)
+ " doesn't supports ordinal spelling.");
}
}