marytts.tools.newlanguage.LexiconCreator
/**
 * Copyright 2008 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.tools.newlanguage;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import marytts.cart.CART;
import marytts.cart.io.MaryCARTReader;
import marytts.cart.io.MaryCARTWriter;
import marytts.exceptions.MaryConfigurationException;
import marytts.fst.AlignerTrainer;
import marytts.fst.FSTLookup;
import marytts.fst.TransducerTrie;
import marytts.modules.phonemiser.AllophoneSet;
import marytts.modules.phonemiser.TrainedLTS;
import marytts.util.MaryUtils;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;

/**
 * The LexiconCreator is the base class for creating the files needed to run the phonemiser component for a new language. From a
 * list of phonetically transcribed words, the class will create:
 * 
 * <ul>
 * <li>a lexicon file, efficiently stored as a Finite State Transducer;</li>
 * <li>a letter-to-sound prediction file, as a decision tree in MARY format.</li>
 * </ul>
 * 
 * The input file is expected to contain data in the following format:
 * 
 * <pre>
 * grapheme | ' a l - l o - p h o n e s | (optional-part-of-speech)
 * </pre>
 * 
 * Hereby, the allophones must correspond to a defined allophone set, given in the constructor. The file's encoding is expected
 * to be UTF-8. Subclasses of LexiconCreator can override prepareLexicon() to provide data in this format.
 * 
 * @see AllophoneSet
 * @author marc
 * 
 */
public class LexiconCreator {
    protected Logger logger;
    protected AllophoneSet allophoneSet;
    protected String lexiconFilename;
    protected String fstFilename;
    protected String ltsFilename;
    protected boolean convertToLowercase;
    protected boolean predictStress;
    protected int context;

    /**
     * Initialise a new lexicon creator. Letter-to-sound rules built with this lexicon creator will convert graphemes to
     * lowercase before prediction, using the locale given in the allophone set; letter-to-sound rules will also predict
     * stress; a context of 2 characters to the left and to the right of the current character will be used as predictive
     * features.
     * 
     * @param allophoneSet
     *            this specifies the set of phonetic symbols that can be used in the lexicon, and provides the locale of the
     *            lexicon
     * @param lexiconFilename
     *            where to find the plain-text lexicon
     * @param fstFilename
     *            where to create the compressed lexicon FST file
     * @param ltsFilename
     *            where to create the letter-to-sound prediction tree
     */
    public LexiconCreator(AllophoneSet allophoneSet, String lexiconFilename, String fstFilename, String ltsFilename) {
        this(allophoneSet, lexiconFilename, fstFilename, ltsFilename, true, true, 2);
    }

    /**
     * Initialize a new lexicon creator.
     * 
     * @param allophoneSet
     *            this specifies the set of phonetic symbols that can be used in the lexicon, and provides the locale of the
     *            lexicon
     * @param lexiconFilename
     *            where to find the plain-text lexicon
     * @param fstFilename
     *            where to create the compressed lexicon FST file
     * @param ltsFilename
     *            where to create the letter-to-sound prediction tree
     * @param convertToLowercase
     *            if true, letter-to-sound rules built with this lexicon creator will convert graphemes to lowercase before
     *            prediction, using the locale given in the allophone set
     * @param predictStress
     *            if true, letter-to-sound rules will predict stress
     * @param context
     *            the number of characters to the left and to the right of the current character to be used as predictive
     *            features
     */
    public LexiconCreator(AllophoneSet allophoneSet, String lexiconFilename, String fstFilename, String ltsFilename,
            boolean convertToLowercase, boolean predictStress, int context) {
        this.allophoneSet = allophoneSet;
        this.lexiconFilename = lexiconFilename;
        this.fstFilename = fstFilename;
        this.ltsFilename = ltsFilename;
        this.convertToLowercase = convertToLowercase;
        this.predictStress = predictStress;
        this.context = context;
        this.logger = MaryUtils.getLogger("LexiconCreator");
    }

    /**
     * This base implementation does nothing. Subclasses can override this method to prepare a lexicon in the expected format,
     * which should then be found at lexiconFilename.
     * 
     * @throws IOException
     *             IOException
     */
    protected void prepareLexicon() throws IOException {
    }

    protected void compileFST() throws IOException {
        logger.info("Compressing into FST:");
        logger.info(" - aligning graphemes and allophones...");
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(lexiconFilename), "UTF-8"));
        AlignerTrainer at = new AlignerTrainer(false, true);
        at.readLexicon(br, "\\s*\\|\\s*");
        br.close();
        // make some alignment iterations
        for (int i = 0; i < 4; i++) {
            logger.info("    iteration " + (i + 1));
            at.alignIteration();
        }
        logger.info(" - entering alignments in trie...");
        TransducerTrie t = new TransducerTrie();
        for (int i = 0, size = at.lexiconSize(); i < size; i++) {
            t.add(at.getAlignment(i));
            t.add(at.getInfoAlignment(i));
        }
        logger.info(" - minimizing trie...");
        t.computeMinimization();
        logger.info(" - writing transducer to disk...");
        File of = new File(fstFilename);
        DataOutputStream os = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(of)));
        t.writeFST(os, "UTF-8");
        os.flush();
        os.close();
    }

    protected void testFST() throws IOException {
        List<String> testGraphemes = new ArrayList<String>();
        List<String> testAllophones = new ArrayList<String>();
        List<String> testPos = new ArrayList<String>();
        int N = 100; // every N'th entry is put into tests...
        loadTestWords(testGraphemes, testAllophones, testPos, N);
        logger.info(" - looking up " + testGraphemes.size() + " test words...");
        FSTLookup fst = new FSTLookup(fstFilename);
        for (int i = 0, max = testGraphemes.size(); i < max; i++) {
            String key = testGraphemes.get(i);
            String expected = testAllophones.get(i);
            String[] result = fst.lookup(key);
            if (testPos.get(i) != null) {
                String key2 = key + testPos.get(i);
                String[] result2 = fst.lookup(key2);
                if (!expected.equals(result2[0]))
                    logger.info("    " + key2 + " -> " + Arrays.toString(result2) + " (expected: " + expected + ")");
                // in addition, expected should be one of the results of a lookup without pos
                boolean found = false;
                for (String r : result) {
                    if (expected.equals(r)) {
                        found = true;
                        break;
                    }
                }
                if (!found)
                    logger.info("    " + key + " -> " + Arrays.toString(result) + " (expected: " + expected + ")");
            } else {
                if (!expected.equals(result[0]))
                    logger.info("    " + key + " -> " + Arrays.toString(result) + " (expected: " + expected + ")");
            }
        }
        logger.info("...done!\n");
    }

    private void loadTestWords(List<String> testGraphemes, List<String> testAllophones, List<String> testPos, int N)
            throws UnsupportedEncodingException, FileNotFoundException, IOException {
        int n = 0;
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(lexiconFilename), "UTF-8"));
        String line;
        while ((line = br.readLine()) != null) {
            String[] parts = line.split("\\s*\\|\\s*");
            String graphemes = parts[0];
            String allophones = parts[1];
            String pos = (parts.length > 2 && parts[2].length() > 0) ? parts[2] : null;
            n++;
            if (n == N) {
                testGraphemes.add(graphemes);
                testAllophones.add(allophones);
                testPos.add(pos);
                n = 0;
            }
        }
    }

    protected void compileLTS() throws IOException {
        logger.info("Training letter-to-sound rules...");
        // initialize trainer
        LTSTrainer tp = new LTSTrainer(allophoneSet, convertToLowercase, predictStress, context);
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(lexiconFilename), "UTF-8"));
        logger.info(" - reading lexicon...");
        // read lexicon for training
        tp.readLexicon(br, "\\s*\\|\\s*");
        logger.info(" - aligning...");
        // make some alignment iterations
        for (int i = 0; i < 5; i++) {
            logger.info("    iteration " + (i + 1));
            tp.alignIteration();
        }
        logger.info(" - training decision tree...");
        CART st = tp.trainTree(10);
        logger.info(" - saving...");
        // new MARY cart format:
        MaryCARTWriter mcw = new MaryCARTWriter();
        mcw.dumpMaryCART(st, ltsFilename);
        // Alternative ways of saving the CART would be:
        // MARY cart text format:
        // PrintWriter pw = new PrintWriter("lib/modules/en/us/lexicon/cmudict.lts.tree.txt", "UTF-8");
        // mcw.toTextOut(st, pw);
        // pw.close();
        // old wagon cart, text and binary format:
        // WagonCARTWriter wcw = new WagonCARTWriter();
        // wcw.dumpWagonCART(st, "lib/modules/en/us/lexicon/cmudict.lts.wagontree.binary");
        // pw = new PrintWriter("lib/modules/en/us/lexicon/cmudict.lts.wagontree.txt", "UTF-8");
        // wcw.toTextOut(st, pw);
        // pw.close();
        // For all of these, it would also be necessary to separately save the feature definition:
        // pw = new PrintWriter("lib/modules/en/us/lexicon/cmudict.lts.pfeats", "UTF-8");
        // st.getFeatureDefinition().writeTo(pw, false);
        // pw.close();
    }

    protected void testLTS() throws IOException, MaryConfigurationException {
        List<String> testGraphemes = new ArrayList<String>();
        List<String> testAllophones = new ArrayList<String>();
        List<String> testPos = new ArrayList<String>();
        int N = 100; // every N'th entry is put into tests...
        loadTestWords(testGraphemes, testAllophones, testPos, N);

        logger.info(" - loading LTS rules...");
        MaryCARTReader cartReader = new MaryCARTReader();
        CART st = cartReader.load(ltsFilename);
        TrainedLTS lts = new TrainedLTS(allophoneSet, st);

        logger.info(" - looking up " + testGraphemes.size() + " test words...");
        int max = testGraphemes.size();
        int correct = 0;
        for (int i = 0; i < max; i++) {
            String key = testGraphemes.get(i);
            String expected = testAllophones.get(i);
            String result = lts.syllabify(lts.predictPronunciation(key));
            if (!expected.equals(result))
                logger.info("    " + key + " -> " + result + " (expected: " + expected + ")");
            else
                correct++;
        }
        logger.info("    for " + correct + " out of " + max + " prediction is identical to lexicon entry.");
        logger.info("...done!\n");
    }

    public void createLexicon() throws Exception {
        prepareLexicon();
        compileFST();
        testFST();
        System.gc();
        compileLTS();
        testLTS();
    }

    /**
     * @param args
     *            args
     * @throws Exception
     *             Exception
     */
    public static void main(String[] args) throws Exception {
        PatternLayout layout = new PatternLayout("%d %m\n");
        BasicConfigurator.configure(new ConsoleAppender(layout));
        AllophoneSet allophoneSet = AllophoneSet.getAllophoneSet(args[0]);
        String lexiconFilename = args[1];
        String fstFilename = args[2];
        String ltsFilename = args[3];
        LexiconCreator lc = new LexiconCreator(allophoneSet, lexiconFilename, fstFilename, ltsFilename);
        lc.createLexicon();
    }
}
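
Usage sketch (not part of the source above): the following minimal driver shows one way to run the lexicon build from user code, relying only on the constructor, createLexicon() and AllophoneSet.getAllophoneSet() calls that appear in this class. The class name BuildMyLexicon, all file paths and the sample lexicon entries are illustrative placeholders; real transcriptions must use symbols defined by your allophone set.

import org.apache.log4j.BasicConfigurator;

import marytts.modules.phonemiser.AllophoneSet;
import marytts.tools.newlanguage.LexiconCreator;

public class BuildMyLexicon {
    public static void main(String[] args) throws Exception {
        // Simple console logging, mirroring what LexiconCreator.main() sets up:
        BasicConfigurator.configure();

        // Placeholder paths -- substitute the real resources for your language:
        AllophoneSet allophones = AllophoneSet.getAllophoneSet("allophones.xy.xml");
        String lexicon = "lexicon.txt"; // plain-text lexicon, UTF-8, one entry per line, e.g.:
        //   hello | ' h E - l @U |
        //   record | ' r E - k O d | NN
        String fst = "lexicon.fst";     // compressed lexicon FST written here
        String lts = "lexicon.lts";     // letter-to-sound decision tree written here

        LexiconCreator creator = new LexiconCreator(allophones, lexicon, fst, lts);
        // Runs prepareLexicon() (a no-op in the base class), compiles and tests the FST,
        // then trains and tests the letter-to-sound tree:
        creator.createLexicon();
    }
}

A new-language build would more typically subclass LexiconCreator and override prepareLexicon() so that the plain-text lexicon is generated at lexiconFilename before the compile and training steps run.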



