// marytts.tools.newlanguage.en_US.CMUDict2MaryFST Maven / Gradle / Ivy
// The newest version!
/**
* Copyright 2007 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.tools.newlanguage.en_US;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.StringTokenizer;
import marytts.modules.phonemiser.AllophoneSet;
import marytts.modules.phonemiser.Syllabifier;
import marytts.tools.newlanguage.LexiconCreator;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.PatternLayout;
/**
 * This class does a one-time, offline conversion from the CMUDict in Festival format (cmudict-0.4.scm and
 * cmudict_extensions.scm) into MARY format. Specifically, the following steps are performed:
 * <ol>
 * <li>conversion to a text format without brackets, using '|' as the delimiter between three fields:
 * <code>graphemes | allophones | part-of-speech(optional)</code></li>
 * <li>conversion of the phonetic alphabet used from MRPA to SAMPA</li>
 * <li>creation of a compact FST representing the lexicon</li>
 * <li>training of Letter-to-sound rules from the data</li>
 * </ol>
 * Only the first two steps are implemented in this class (see {@link #prepareLexicon()}); the remaining steps are
 * expected to be carried out by the {@link LexiconCreator} superclass.
 * <p>
 * All paths are relative, so the program must be started from the MARY base directory.
 *
 * @author marc
 *
 */
public class CMUDict2MaryFST extends LexiconCreator {
    /** Directory (relative to the MARY base directory) holding the en_US lexicon resources. */
    private static final String LEXPATH = "lib/modules/en/us/lexicon/";

    /** Maps an MRPA phone symbol to its SAMPA equivalent; (re)built by {@link #prepareLexicon()}. */
    private Map<String, String> mrpa2sampa;

    /**
     * Sets up the converter with the en_US allophone set and the fixed input/output file locations below
     * {@link #LEXPATH}.
     *
     * @throws Exception
     *             if the superclass constructor or loading the allophone set fails
     */
    public CMUDict2MaryFST() throws Exception {
        super(AllophoneSet.getAllophoneSet(LEXPATH + "allophones.en_US.xml"), LEXPATH + "cmudictSampa.txt", LEXPATH
                + "cmudict.fst", LEXPATH + "cmudict.lts", true, // convert to lowercase
                true, // predict stress
                3 // number of characters to the left and to the right to use for prediction
        );
    }

    /**
     * Converts the Festival-format CMUDict and its extensions file into the MARY text lexicon
     * (<code>graphemes | allophones | part-of-speech</code>, UTF-8) written to <code>lexiconFilename</code>.
     *
     * @throws IOException
     *             if an input file cannot be read or the lexicon file cannot be written
     * @throws IllegalStateException
     *             if the CMUDict file is not found, i.e. the program was not started from the MARY base directory
     */
    @Override
    protected void prepareLexicon() throws IOException {
        File cmudict = new File(LEXPATH + "cmu/cmudict-0.4.scm");
        if (!cmudict.exists())
            throw new IllegalStateException("This program should be called from the MARY base directory.");
        File extensions = new File(LEXPATH + "cmu/cmudict_extensions.scm");
        File cmudictSampa = new File(lexiconFilename);
        // Convert to SAMPA text dictionary
        logger.info("Converting dictionary to MARY text format...");
        mrpa2sampa = new HashMap<String, String>();
        fillSampaMap();
        PrintWriter toSampa = new PrintWriter(cmudictSampa, "UTF-8");
        try {
            appendConverted(cmudict, toSampa);
            appendConverted(extensions, toSampa);
            // PrintWriter never throws on write errors; ask explicitly so a truncated lexicon is not silently accepted.
            if (toSampa.checkError()) {
                throw new IOException("Error writing lexicon file '" + cmudictSampa + "'");
            }
        } finally {
            toSampa.close();
        }
        logger.info("...done!\n");
    }

    /**
     * Reads one Festival-format dictionary file (ASCII) and appends its converted entries to the given writer. The
     * reader is closed even if the conversion fails.
     *
     * @param festivalDict
     *            the dictionary file to read
     * @param toSampa
     *            the writer receiving the converted entries; not closed by this method
     * @throws IOException
     *             if the file cannot be opened or read
     */
    private void appendConverted(File festivalDict, PrintWriter toSampa) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(festivalDict), "ASCII"));
        try {
            convertToSampa(br, toSampa);
        } finally {
            br.close();
        }
    }

    /**
     * Loads the phone inventory mapping file and fills {@link #mrpa2sampa} from it. Empty lines and lines starting
     * with '#' are ignored.
     *
     * @throws IllegalArgumentException
     *             if the mapping file cannot be read or contains an invalid entry
     */
    private void fillSampaMap() {
        // Any phone inventory mappings?
        String sampamapFilename = "lib/modules/en/synthesis/sampa2mrpa_en.map";
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(sampamapFilename), "UTF-8"));
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    line = line.trim();
                    if (line.equals("") || line.startsWith("#")) {
                        continue; // ignore empty and comment lines
                    }
                    try {
                        addSampaMapEntry(line);
                    } catch (IllegalArgumentException iae) {
                        throw new IllegalArgumentException("Invalid entry '" + line + "' in sampa map file "
                                + sampamapFilename, iae);
                    }
                }
            } finally {
                br.close();
            }
        } catch (IOException ioe) {
            throw new IllegalArgumentException("Cannot open file '" + sampamapFilename + "'", ioe);
        }
    }

    /**
     * Parses one line of the mapping file and, if it defines a mapping in the right-to-left direction, records it in
     * {@link #mrpa2sampa}.
     * <p>
     * Accepted forms are <code>a&lt;-&gt;b</code>, <code>a-&gt;b</code> and <code>a&lt;-b</code>. Only the first and
     * the last form map <code>b</code> to <code>a</code> and are therefore stored; a pure <code>a-&gt;b</code> entry
     * is validated but otherwise ignored.
     *
     * @param entry
     *            a non-empty, non-comment line of the mapping file
     * @throws IllegalArgumentException
     *             if the entry does not consist of exactly two parts separated by one of the arrows
     */
    private void addSampaMapEntry(String entry) throws IllegalArgumentException {
        boolean v2s = false;
        String[] parts = null;
        // For one-to-many mappings, '+' can be used to group phone symbols.
        // E.g., the line "EI->E:+I" would map "EI" to "E:" and "I"
        entry = entry.replace('+', ' ');
        // "<->" must be tested first because it contains both of the other arrows.
        if (entry.indexOf("<->") != -1) {
            parts = entry.split("<->");
            v2s = true;
        } else if (entry.indexOf("->") != -1) {
            parts = entry.split("->");
        } else if (entry.indexOf("<-") != -1) {
            parts = entry.split("<-");
            v2s = true;
        }
        if (parts == null || parts.length != 2) { // invalid entry
            throw new IllegalArgumentException("Expected exactly two symbols separated by '<->', '->' or '<-'");
        }
        if (v2s) {
            mrpa2sampa.put(parts[1].trim(), parts[0].trim());
        }
    }

    /**
     * Converts a single phonetic symbol in MRPA representation into its equivalent in MARY sampa representation.
     *
     * @param voicePhoneme
     *            the MRPA symbol to convert
     * @return the converted phone, or the input string if no known conversion exists.
     */
    private String mrpa2sampa(String voicePhoneme) {
        String sampa = mrpa2sampa.get(voicePhoneme);
        return sampa != null ? sampa : voicePhoneme;
    }

    /**
     * Converts a whitespace-separated sequence of MRPA phones into a space-separated SAMPA string.
     * <p>
     * A trailing '1' on a phone is kept on the converted phone, a trailing '0' is dropped. The converted list is then
     * passed through a {@link Syllabifier} before being joined.
     *
     * @param mrpaString
     *            whitespace-separated MRPA phones, e.g. <code>ax k w ay1 er0</code>
     * @return the list as left by the syllabifier, joined with single spaces
     */
    private String mrpaString2sampaString(String mrpaString) {
        StringTokenizer st = new StringTokenizer(mrpaString);
        LinkedList<String> sampaList = new LinkedList<String>();
        while (st.hasMoreTokens()) {
            String mrpa = st.nextToken();
            String sampa;
            if (mrpa.endsWith("1")) {
                sampa = mrpa2sampa(mrpa.substring(0, mrpa.length() - 1)) + "1";
            } else if (mrpa.endsWith("0")) {
                sampa = mrpa2sampa(mrpa.substring(0, mrpa.length() - 1));
            } else {
                sampa = mrpa2sampa(mrpa);
            }
            sampaList.add(sampa);
        }
        // NOTE(review): syllabify() is presumably modifying sampaList in place (its return value is not used here).
        new Syllabifier(allophoneSet).syllabify(sampaList);
        StringBuilder sb = new StringBuilder();
        for (String s : sampaList) {
            if (sb.length() > 0)
                sb.append(" ");
            sb.append(s);
        }
        return sb.toString();
    }

    /**
     * Converts every entry read from <code>br</code> and prints it to <code>toSampa</code> as
     * <code>graphemes | allophones | (pos)</code>. Empty lines and lines starting with ';' are ignored. Lines that do
     * not have the expected shape are reported on <code>System.err</code> and skipped.
     *
     * @param br
     *            reader positioned at the start of a Festival-format dictionary; not closed by this method
     * @param toSampa
     *            writer receiving one converted entry per line; not closed by this method
     * @throws IOException
     *             if reading from <code>br</code> fails
     */
    private void convertToSampa(BufferedReader br, PrintWriter toSampa) throws IOException {
        String line;
        while ((line = br.readLine()) != null) {
            line = line.trim();
            // skip comments:
            if (line.startsWith(";") || line.equals(""))
                continue;
            // expected line format:
            // ("acquirer" nil (ax k w ay1 er0 er0))
            int firstQuote = line.indexOf('"');
            if (firstQuote < 0) {
                System.err.println("Skipping strange line (no first quote): " + line);
                continue;
            }
            int secondQuote = line.indexOf('"', firstQuote + 1);
            if (secondQuote <= firstQuote) {
                System.err.println("Skipping strange line (no second quote): " + line);
                continue;
            }
            int firstSpace = secondQuote + 1;
            if (firstSpace >= line.length() || line.charAt(firstSpace) != ' ') {
                System.err.println("Skipping strange line (no first space): " + line);
                continue;
            }
            int secondSpace = line.indexOf(' ', firstSpace + 1);
            if (secondSpace <= firstSpace) {
                System.err.println("Skipping strange line (no second space): " + line);
                continue;
            }
            int firstBracket = secondSpace + 1;
            if (firstBracket >= line.length() || line.charAt(firstBracket) != '(') {
                System.err.println("Skipping strange line (no first bracket): " + line);
                continue;
            }
            int secondBracket = line.indexOf(')', firstBracket + 1);
            if (secondBracket <= firstBracket) {
                System.err.println("Skipping strange line (no second bracket): " + line);
                continue;
            }
            String graphemes = line.substring(firstQuote + 1, secondQuote);
            String pos = line.substring(firstSpace + 1, secondSpace);
            // "nil" means no part-of-speech; otherwise it is written in parentheses
            if (pos.equals("nil"))
                pos = "";
            else
                pos = "(" + pos + ")";
            String allophones = line.substring(firstBracket + 1, secondBracket);
            String sampaString = mrpaString2sampaString(allophones);
            toSampa.println(graphemes + " | " + sampaString + " | " + pos);
        }
    }

    /**
     * Command-line entry point: configures console logging and runs the lexicon creation.
     *
     * @param args
     *            ignored
     * @throws Exception
     *             if the conversion fails
     */
    public static void main(String[] args) throws Exception {
        PatternLayout layout = new PatternLayout("%d %m\n");
        BasicConfigurator.configure(new ConsoleAppender(layout));
        CMUDict2MaryFST c2m = new CMUDict2MaryFST();
        c2m.createLexicon();
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy