// marytts.modules.acoustic.HMMModel Maven / Gradle / Ivy
// The newest version!
/**
* Copyright 2010 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.modules.acoustic;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.WeakHashMap;
import marytts.exceptions.MaryConfigurationException;
import marytts.features.FeatureDefinition;
import marytts.features.FeatureProcessorManager;
import marytts.features.FeatureVector;
import marytts.htsengine.CartTreeSet;
import marytts.htsengine.HMMData;
import marytts.htsengine.HTSModel;
import marytts.htsengine.HTSParameterGeneration;
import marytts.htsengine.HTSUttModel;
import marytts.unitselection.select.Target;
import marytts.util.MaryUtils;
import org.apache.log4j.Logger;
import org.w3c.dom.Element;
/**
* Model for predicting duration and F0 from HMMs
*
* @author marcela
*
*/
public class HMMModel extends Model {
/**
* Configuration information of the model
*/
private HMMData htsData = null;
/**
* HMM trees and pdfs for this model.
*/
private CartTreeSet cart;
/**
* Feature definition used when training HMMs.
*/
FeatureDefinition hmmFeatureDefinition;
/**
* to calculate duration in seconds.
*/
private float fperiodsec;
protected static Logger logger = MaryUtils.getLogger("HMMModel");
/**
* If the model is instantiated because the same HHMModel is used for predicting both F0 and duration, set this variable true;
* this is done in Voice.loadAcousticModels(), when creating the models.
*/
private boolean predictDurAndF0 = false;
/**
* This list keeps a copy of the utterance model, this is done when the same HMMModel is used for predicting durations and F0,
* the idea is to keep in the utterance model list the state durations predicted together with duration, these state durations
* are used when predicting F0, so the same state duration is applied.
*/
private Map, HTSUttModel> uttModels = new WeakHashMap, HTSUttModel>();
/**
* Model constructor
*
* @param featureManager
* the feature processor manager used to compute the symbolic features used for prediction
* @param voiceName
* in HMM models this data file corresponds to the configuration file of the HMM voice
* @param dataStream
* dataStream
* @param targetAttributeName
* attribute in MARYXML to predict
* @param targetAttributeFormat
* print style, not used in HMM models
* @param featureName
* not used in HMMModel
* @param predictFrom
* not used in HMMModel
* @param applyTo
* not used in HMMModel
*
* @throws MaryConfigurationException
* if there are missing files or problems loading trees and pdf files.
*/
public HMMModel(FeatureProcessorManager featureManager, String voiceName, InputStream dataStream, String targetAttributeName,
String targetAttributeFormat, String featureName, String predictFrom, String applyTo)
throws MaryConfigurationException {
super(featureManager, voiceName, dataStream, targetAttributeName, targetAttributeFormat, featureName, predictFrom,
applyTo);
if (!(targetAttributeName.contentEquals("d") || targetAttributeName.contentEquals("f0"))) {
throw new MaryConfigurationException("targetAttributeName = " + targetAttributeName + " Not known");
}
load();
}
/**
* This variable is set to true whenever the same HMMModel is used to predict both duration and F0. by default the variable is
* false, so that means that two different HMMModels are used for predicting duration and F0, in this case there is no state
* durations information to predict F0.
*
* @param bval
* bval
*/
public void setPredictDurAndF0(boolean bval) {
predictDurAndF0 = bval;
}
/**
* Load trees and pdfs, from HMM configuration file.
*
* @throws MaryConfigurationException
* if there are missing files or problems loading trees and pdf files.
*/
@Override
protected void loadData() throws IOException, MaryConfigurationException {
if (htsData == null)
htsData = new HMMData();
// we use the configuration of the HMM voice whose hmm models will be used
htsData.initHMMDataForHMMModel(voiceName);
cart = htsData.getCartTreeSet();
fperiodsec = ((float) htsData.getFperiod() / (float) htsData.getRate());
predictionFeatureNames = htsData.getFeatureDefinition().getFeatureNames();
}
/**
* Predict duration for the list of elements. If the same HMMModel is used to predict duration and F0 then a utterance model
* is created and kept in a WeakHashMap, so the next call to this module, for predicting F0, can use that utterance model.
*
* @param elements
* elements from MaryXML for which to predict the values
*
* @throws MaryConfigurationException
* if error searching in HMM trees.
*/
@Override
public void applyTo(List elements) throws MaryConfigurationException {
logger.debug("predicting duration");
HTSUttModel um = predictAndSetDuration(elements, elements);
if (predictDurAndF0) { // this same model will be used for predicting F0 -- remember um
uttModels.put(elements, um);
}
}
/**
* Predict F0 for the list of elements and apply to another list of elements. If the same HMMModel is used to predict duration
* and F0 then there must be a utterance model created in a previous call to this module, that will be used to predict F0. If
* there is no previously created utterance model then one is created.
*
* @param predictFromElements
* elements from MaryXML for which to predict the values
* @param applyToElements
* elements from MaryXML for which to apply the predicted values
*
* @throws MaryConfigurationException
* if error searching in HMM trees.
*/
@Override
public void applyFromTo(List predictFromElements, List applyToElements) throws MaryConfigurationException {
logger.debug("predicting F0");
// Two possibilities: Either we have an uttModel due to a previous call to applyTo()
// (in which case the lookup key should be applyToElements),
// or we don't -- in which case we must create an uttModel from the XML.
HTSUttModel um;
if (predictDurAndF0) {
logger.debug("using already created utterance model, it contains predicted state durations.");
um = uttModels.get(applyToElements); // it must be already created so get it from the uttModels Map
} else {
logger.debug("creating utterance model with equal values for state durations.");
um = createUttModel(predictFromElements); // create a um, state durations are set equal for all states
}
assert um != null;
predictAndSetF0(applyToElements, um);
}
/**
* Predict durations and state durations from predictFromElements and apply durations to applyToElements. A utterance model is
* created that contains the predicted state durations.
*
* @param predictFromElements
* elements to predict from
* @param applyToElements
* elements to apply predicted durations
*
* @return HTSUttModel a utterance model
*
* @throws MaryConfigurationException
* if error searching in HMM trees.
*/
private HTSUttModel predictAndSetDuration(List predictFromElements, List applyToElements)
throws MaryConfigurationException {
List predictorElements = predictFromElements;
List predictorTargets = getTargets(predictorElements);
FeatureVector fv = null;
HTSUttModel um = new HTSUttModel();
FeatureDefinition feaDef = htsData.getFeatureDefinition();
double diffdurOld = 0.0;
double diffdurNew = 0.0;
String durAttributeName = "d";
try {
// (1) Predict the values
for (int i = 0; i < predictorTargets.size(); i++) {
// Retrieve values
fv = predictorTargets.get(i).getFeatureVector();
um.addUttModel(new HTSModel(cart.getNumStates()));
HTSModel m = um.getUttModel(i);
Element element = applyToElements.get(i);
/* this function also sets the phone name, the phone between - and + */
m.setPhoneName(fv.getFeatureAsString(feaDef.getFeatureIndex("phone"), feaDef));
/* Check if context-dependent gv (gv without sil) */
if (htsData.getUseContextDependentGV()) {
if (m.getPhoneName().contentEquals("_"))
m.setGvSwitch(false);
}
/* increment number of models in utterance model */
um.setNumModel(um.getNumModel() + 1);
/* update number of states */
um.setNumState(um.getNumState() + cart.getNumStates());
String formattedTargetValue;
double duration;
// if the attribute already exists for this element keep it
if (element.hasAttribute(durAttributeName)) {
// Element is in milliseconds already, so convert to second in order to get reformatted in ms après
formattedTargetValue = element.getAttribute(durAttributeName);
duration = Float.parseFloat(formattedTargetValue) / 1000;
formattedTargetValue = String.format(targetAttributeFormat, duration);
um.setTotalFrame(um.getTotalFrame() + (int) Math.round(duration / fperiodsec));
} else {
// Estimate state duration from state duration model (Gaussian)
diffdurNew = cart.searchDurInCartTree(m, fv, htsData, diffdurOld);
diffdurOld = diffdurNew;
duration = m.getTotalDur() * fperiodsec; // in seconds
um.setTotalFrame(um.getTotalFrame() + m.getTotalDur());
}
/*
* Find pdf for LF0, this function sets the pdf for each state. and determines, according to the HMM models,
* whether the states are voiced or unvoiced, (it can be possible that some states are voiced and some unvoiced).
*/
cart.searchLf0InCartTree(m, fv, feaDef, htsData.getUV());
for (int mstate = 0; mstate < cart.getNumStates(); mstate++) {
for (int frame = 0; frame < m.getDur(mstate); frame++) {
if (m.getVoiced(mstate))
um.setLf0Frame(um.getLf0Frame() + 1);
}
}
// "evaluate" pseudo XPath syntax:
// TODO this needs to be extended to take into account targetAttributeNames like "foo/@bar", which would add the
// bar attribute to the foo child of this element, creating the child if not already present...
if (durAttributeName.startsWith("@")) {
durAttributeName = durAttributeName.replaceFirst("@", "");
}
formattedTargetValue = String.format(targetAttributeFormat, duration);
// set the new attribute value:
element.setAttribute(durAttributeName, formattedTargetValue);
}
return um;
} catch (Exception e) {
throw new MaryConfigurationException("Error searching in tree when predicting duration. ", e);
}
}
/**
* Predict F0 from the utterance model and apply to elements
*
* @param applyToElements
* elements to apply predicted F0s
* @param um
* utterance model that contains the set of elements (phonemes) and state durations for generating F0.
*
* @throws MaryConfigurationException
* if error generating F0 out of HMMs trees and pdfs.
*/
private void predictAndSetF0(List applyToElements, HTSUttModel um) throws MaryConfigurationException {
HTSModel m;
try {
String f0AttributeName = "f0";
HTSParameterGeneration pdf2par = new HTSParameterGeneration();
/* Once we have all the phone models Process UttModel */
/* Generate sequence of speech parameter vectors, generate parameters out of sequence of pdf's */
boolean debug = false; /* so it does not save the generated parameters. */
/* this function generates features just for the trees and pdf that are not null in the HMM cart */
pdf2par.htsMaximumLikelihoodParameterGeneration(um, htsData);
// (2) include the predicted values in applicableElements (as it is done in Model)
boolean voiced[] = pdf2par.getVoicedArray();
int numVoiced = 0;
// make sure that the number of applicable elements is the same as the predicted number of elements
assert applyToElements.size() == um.getNumModel();
float f0;
String formattedTargetValue;
int t = 0;
for (int i = 0; i < applyToElements.size(); i++) { // this will be the same as the utterance model set
m = um.getUttModel(i);
int k = 1;
int numVoicedInModel = m.getNumVoiced();
formattedTargetValue = "";
// System.out.format("phone = %s dur_in_frames=%d num_voiced_frames=%d : ", m.getPhoneName(), m.getTotalDur(),
// numVoicedInModel);
for (int mstate = 0; mstate < cart.getNumStates(); mstate++) {
for (int frame = 0; frame < m.getDur(mstate); frame++) {
if (voiced[t++]) { // numVoiced and t are not the same because voiced values can be true or false,
// numVoiced count just the voiced
f0 = (float) Math.exp(pdf2par.getlf0Pst().getPar(numVoiced++, 0));
formattedTargetValue += "(" + Integer.toString((int) ((k * 100.0) / numVoicedInModel)) + ","
+ Integer.toString((int) f0) + ")";
k++;
}
}
}
Element element = applyToElements.get(i);
// "evaluate" pseudo XPath syntax:
// TODO this needs to be extended to take into account targetAttributeNames like "foo/@bar", which would add the
// bar attribute to the foo child of this element, creating the child if not already present...
if (f0AttributeName.startsWith("@")) {
f0AttributeName = f0AttributeName.replaceFirst("@", "");
}
// format targetValue according to targetAttributeFormat
// String formattedTargetValue = String.format(targetAttributeFormat, targetValue);
// set the new attribute value:
// if the whole segment is unvoiced then f0 should not be fixed?
if (formattedTargetValue.length() > 0)
element.setAttribute(f0AttributeName, formattedTargetValue);
// System.out.println(formattedTargetValue);
}
// once finished re-set to null um
// um = null;
} catch (Exception e) {
throw new MaryConfigurationException("Error generating F0 out of HMMs trees and pdfs. ", e);
}
}
/**
* Create a utterance model list from feature vectors predicted from elements.
*
* @param predictFromElements
* elements from MaryXML from where to get feature vectors.
*
* @return Utterance model um containing state durations and pdfs already searched on the trees to generate F0.
*
* @throws MaryConfigurationException
* if error searching in HMM trees.
*/
private HTSUttModel createUttModel(List predictFromElements) throws MaryConfigurationException {
int i, k, s, t, mstate, frame, durInFrames, durStateInFrames, numVoicedInModel;
HTSModel m;
List predictorElements = predictFromElements;
List predictorTargets = getTargets(predictorElements);
FeatureVector fv;
HTSUttModel um = new HTSUttModel();
FeatureDefinition feaDef = htsData.getFeatureDefinition();
float duration;
double diffdurOld = 0.0;
double diffdurNew = 0.0;
float f0s[] = null;
try {
// (1) Predict the values
for (i = 0; i < predictorTargets.size(); i++) {
fv = predictorTargets.get(i).getFeatureVector();
Element e = predictFromElements.get(i);
um.addUttModel(new HTSModel(cart.getNumStates()));
m = um.getUttModel(i);
/* this function also sets the phone name, the phone between - and + */
m.setPhoneName(fv.getFeatureAsString(feaDef.getFeatureIndex("phone"), feaDef));
/* Check if context-dependent gv (gv without sil) */
if (htsData.getUseContextDependentGV()) {
if (m.getPhoneName().contentEquals("_"))
m.setGvSwitch(false);
}
/* increment number of models in utterance model */
um.setNumModel(um.getNumModel() + 1);
/* update number of states */
um.setNumState(um.getNumState() + cart.getNumStates());
// get the duration from the element
duration = Integer.parseInt(e.getAttribute("d")) * 0.001f; // in sec.
// distribute the duration (in frames) among the five states, here it is done the same amount for each state
durInFrames = (int) (duration / fperiodsec);
durStateInFrames = (int) (durInFrames / cart.getNumStates());
m.setTotalDur(0); // reset to set new value according to duration
for (s = 0; s < cart.getNumStates(); s++) {
m.setDur(s, durStateInFrames);
m.setTotalDur(m.getTotalDur() + m.getDur(s));
}
um.setTotalFrame(um.getTotalFrame() + m.getTotalDur());
System.out.format("createUttModel: duration=%.3f sec. durInFrames=%d durStateInFrames=%d m.getTotalDur()=%d\n",
duration, durInFrames, durStateInFrames, m.getTotalDur());
/*
* Find pdf for LF0, this function sets the pdf for each state. and determines, according to the HMM models,
* whether the states are voiced or unvoiced, (it can be possible that some states are voiced and some unvoiced).
*/
cart.searchLf0InCartTree(m, fv, feaDef, htsData.getUV());
for (mstate = 0; mstate < cart.getNumStates(); mstate++) {
for (frame = 0; frame < m.getDur(mstate); frame++)
if (m.getVoiced(mstate))
um.setLf0Frame(um.getLf0Frame() + 1);
}
}
return um;
} catch (Exception e) {
throw new MaryConfigurationException("Error searching in tree when creating utterance model. ", e);
}
}
/**
* Apply the HMM to a Target to get its predicted value, this method is not used in HMMModel.
*
* @throws RuntimeException
* if this method is called.
*/
@Override
protected float evaluate(Target target) {
throw new RuntimeException("This method should never be called");
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy