All Downloads are FREE. Search and download functionalities are using the official Maven repository.

marytts.tools.newlanguage.en_US.CMUDict2MaryFST Maven / Gradle / Ivy

The newest version!
/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.tools.newlanguage.en_US;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.StringTokenizer;

import marytts.modules.phonemiser.AllophoneSet;
import marytts.modules.phonemiser.Syllabifier;
import marytts.tools.newlanguage.LexiconCreator;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.PatternLayout;

/**
 * This class does a one-time, offline conversion from the CMUDict in Festival format (cmudict-0.4.scm and cmudict_extensions.scm)
 * into MARY format. Specifically, the following steps are performed:
 * 
    *
  1. conversion to a text format without brackets, using '|' as the delimiter between three fields: * graphemes | allophones | part-of-speech(optional)
  2. *
  3. conversion of the phonetic alphabet used from MRPA to SAMPA
  4. *
  5. creation of a compact FST representing the lexicon
  6. *
  7. training of Letter-to-sound rules from the data
  8. *
* * @author marc * */ public class CMUDict2MaryFST extends LexiconCreator { private static final String LEXPATH = "lib/modules/en/us/lexicon/"; public CMUDict2MaryFST() throws Exception { super(AllophoneSet.getAllophoneSet(LEXPATH + "allophones.en_US.xml"), LEXPATH + "cmudictSampa.txt", LEXPATH + "cmudict.fst", LEXPATH + "cmudict.lts", true, // convert to lowercase true, // predict stress 3 // number of characters to the left and to the right to use for prediction ); } @Override protected void prepareLexicon() throws IOException { File cmudict = new File(LEXPATH + "cmu/cmudict-0.4.scm"); if (!cmudict.exists()) throw new IllegalStateException("This program should be called from the MARY base directory."); File extensions = new File(LEXPATH + "cmu/cmudict_extensions.scm"); File cmudictSampa = new File(lexiconFilename); // Convert to SAMPA text dictionary logger.info("Converting dictionary to MARY text format..."); mrpa2sampa = new HashMap(); fillSampaMap(); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(cmudict), "ASCII")); PrintWriter toSampa = new PrintWriter(cmudictSampa, "UTF-8"); convertToSampa(br, toSampa); br.close(); br = new BufferedReader(new InputStreamReader(new FileInputStream(extensions), "ASCII")); convertToSampa(br, toSampa); br.close(); toSampa.close(); logger.info("...done!\n"); } private Map mrpa2sampa; private void fillSampaMap() { // Any phone inventory mappings? 
String sampamapFilename = "lib/modules/en/synthesis/sampa2mrpa_en.map"; try { BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(sampamapFilename), "UTF-8")); String line; while ((line = br.readLine()) != null) { line = line.trim(); if (line.equals("") || line.startsWith("#")) { continue; // ignore empty and comment lines } try { addSampaMapEntry(line); } catch (IllegalArgumentException iae) { throw new IllegalArgumentException("Ignoring invalid entry in sampa map file " + sampamapFilename, iae); } } } catch (IOException ioe) { throw new IllegalArgumentException("Cannot open file '" + sampamapFilename + "'", ioe); } } private void addSampaMapEntry(String entry) throws IllegalArgumentException { boolean s2v = false; boolean v2s = false; String[] parts = null; // For one-to-many mappings, '+' can be used to group phone symbols. // E.g., the line "EI->E:+I" would map "EI" to "E:" and "I" entry = entry.replace('+', ' '); if (entry.indexOf("<->") != -1) { parts = entry.split("<->"); s2v = true; v2s = true; } else if (entry.indexOf("->") != -1) { parts = entry.split("->"); s2v = true; } else if (entry.indexOf("<-") != -1) { parts = entry.split("<-"); v2s = true; } if (parts == null || parts.length != 2) { // invalid entry throw new IllegalArgumentException(); } if (v2s) { mrpa2sampa.put(parts[1].trim(), parts[0].trim()); } } /** * Converts a single phonetic symbol in MRPA representation representation into its equivalent in MARY sampa representation. * * @param voicePhoneme * voicePhoneme * @return the converted phone, or the input string if no known conversion exists. 
*/ private String mrpa2sampa(String voicePhoneme) { if (mrpa2sampa.containsKey(voicePhoneme)) return mrpa2sampa.get(voicePhoneme); else return voicePhoneme; } private String mrpaString2sampaString(String mrpaString) { StringTokenizer st = new StringTokenizer(mrpaString); LinkedList sampaList = new LinkedList(); while (st.hasMoreTokens()) { String mrpa = st.nextToken(); String sampa; if (mrpa.endsWith("1")) { sampa = mrpa2sampa(mrpa.substring(0, mrpa.length() - 1)) + "1"; } else if (mrpa.endsWith("0")) { sampa = mrpa2sampa(mrpa.substring(0, mrpa.length() - 1)); } else { sampa = mrpa2sampa(mrpa); } sampaList.add(sampa); } new Syllabifier(allophoneSet).syllabify(sampaList); StringBuilder sb = new StringBuilder(); for (String s : sampaList) { if (sb.length() > 0) sb.append(" "); sb.append(s); } return sb.toString(); } private void convertToSampa(BufferedReader br, PrintWriter toSampa) throws IOException { String line; while ((line = br.readLine()) != null) { line = line.trim(); // skip comments: if (line.startsWith(";") || line.equals("")) continue; // expected line format: // ("acquirer" nil (ax k w ay1 er0 er0)) int firstQuote = line.indexOf('"'); if (!(firstQuote >= 0)) { System.err.println("Skipping strange line (no first quote): " + line); } int secondQuote = line.indexOf('"', firstQuote + 1); if (!(secondQuote > firstQuote)) { System.err.println("Skipping strange line (no second quote): " + line); } int firstSpace = secondQuote + 1; if (!(line.charAt(firstSpace) == ' ')) { System.err.println("Skipping strange line (no first space): " + line); } int secondSpace = line.indexOf(' ', firstSpace + 1); if (!(secondSpace > firstSpace)) { System.err.println("Skipping strange line (no second space): " + line); } int firstBracket = secondSpace + 1; if (!(line.charAt(firstBracket) == '(')) { System.err.println("Skipping strange line (no first bracket): " + line); } int secondBracket = line.indexOf(')', firstBracket + 1); if (!(secondBracket > firstBracket)) { 
System.err.println("Skipping strange line (no second bracket): " + line); } String graphemes = line.substring(firstQuote + 1, secondQuote); String pos = line.substring(firstSpace + 1, secondSpace); if (pos.equals("nil")) pos = ""; else pos = "(" + pos + ")"; String allophones = line.substring(firstBracket + 1, secondBracket); String sampaString = mrpaString2sampaString(allophones); toSampa.println(graphemes + " | " + sampaString + " | " + pos); } } /** * @param args * args * @throws Exception * Exception */ public static void main(String[] args) throws Exception { PatternLayout layout = new PatternLayout("%d %m\n"); BasicConfigurator.configure(new ConsoleAppender(layout)); CMUDict2MaryFST c2m = new CMUDict2MaryFST(); c2m.createLexicon(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy