marytts.vocalizations.HNMSynthesisTechnology Maven / Gradle / Ivy
The newest version!
/**
* Copyright 2000-2006 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see .
*
*/
package marytts.vocalizations;
import java.io.IOException;
import java.util.Arrays;
import java.util.LinkedList;
import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import marytts.exceptions.MaryConfigurationException;
import marytts.exceptions.SynthesisException;
import marytts.signalproc.adaptation.prosody.BasicProsodyModifierParams;
import marytts.signalproc.analysis.RegularizedCepstrumEstimator;
import marytts.signalproc.sinusoidal.hntm.analysis.HntmAnalyzerParams;
import marytts.signalproc.sinusoidal.hntm.analysis.HntmSpeechSignal;
import marytts.signalproc.sinusoidal.hntm.synthesis.HntmSynthesizedSignal;
import marytts.signalproc.sinusoidal.hntm.synthesis.HntmSynthesizer;
import marytts.signalproc.sinusoidal.hntm.synthesis.HntmSynthesizerParams;
import marytts.unitselection.data.TimelineReader;
import marytts.util.data.BufferedDoubleDataSource;
import marytts.util.data.Datagram;
import marytts.util.data.DatagramDoubleDataSource;
import marytts.util.data.DoubleDataSource;
import marytts.util.data.audio.DDSAudioInputStream;
import marytts.util.math.MathUtils;
import marytts.util.math.Polynomial;
/**
* HNM Synthesis technology to synthesize vocalizations
*
* @author Sathish Pammi
*/
public class HNMSynthesisTechnology extends VocalizationSynthesisTechnology {
protected HNMFeatureFileReader vHNMFeaturesReader;
protected VocalizationIntonationReader vIntonationReader;
protected HntmAnalyzerParams analysisParams;
protected HntmSynthesizerParams synthesisParams;
protected TimelineReader audioTimeline;
protected VocalizationUnitFileReader unitFileReader;
protected boolean f0ContourImposeSupport;
public HNMSynthesisTechnology(String waveTimeLineFile, String unitFile, String hnmFeatureFile, String intonationFeatureFile,
boolean imposeF0Support) throws MaryConfigurationException {
try {
this.audioTimeline = new TimelineReader(waveTimeLineFile);
this.unitFileReader = new VocalizationUnitFileReader(unitFile);
this.f0ContourImposeSupport = imposeF0Support;
this.vHNMFeaturesReader = new HNMFeatureFileReader(hnmFeatureFile);
if (f0ContourImposeSupport) {
this.vIntonationReader = new VocalizationIntonationReader(intonationFeatureFile);
} else {
this.vIntonationReader = null;
}
} catch (IOException e) {
throw new MaryConfigurationException("Can not read data from files " + e);
}
initializeParameters();
}
public HNMSynthesisTechnology(TimelineReader audioTimeline, VocalizationUnitFileReader unitFileReader,
HNMFeatureFileReader vHNMFeaturesReader, VocalizationIntonationReader vIntonationReader, boolean imposeF0Support) {
this.audioTimeline = audioTimeline;
this.unitFileReader = unitFileReader;
this.vHNMFeaturesReader = vHNMFeaturesReader;
this.vIntonationReader = vIntonationReader;
this.f0ContourImposeSupport = imposeF0Support;
initializeParameters();
}
/**
* intialize hnm parameters
*/
private void initializeParameters() {
// Analysis parameters
analysisParams = new HntmAnalyzerParams();
analysisParams.harmonicModel = HntmAnalyzerParams.HARMONICS_PLUS_NOISE;
analysisParams.noiseModel = HntmAnalyzerParams.WAVEFORM;
analysisParams.useHarmonicAmplitudesDirectly = true;
analysisParams.harmonicSynthesisMethodBeforeNoiseAnalysis = HntmSynthesizerParams.LINEAR_PHASE_INTERPOLATION;
analysisParams.regularizedCepstrumWarpingMethod = RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_POST_MEL_WARPING;
// Synthesis parameters
synthesisParams = new HntmSynthesizerParams();
synthesisParams.harmonicPartSynthesisMethod = HntmSynthesizerParams.LINEAR_PHASE_INTERPOLATION;
// synthesisParams.harmonicPartSynthesisMethod = HntmSynthesizerParams.QUADRATIC_PHASE_INTERPOLATION;
synthesisParams.overlappingHarmonicPartSynthesis = false;
synthesisParams.harmonicSynthesisOverlapInSeconds = 0.010f;
/* to output just one file */
synthesisParams.writeHarmonicPartToSeparateFile = false;
synthesisParams.writeNoisePartToSeparateFile = false;
synthesisParams.writeTransientPartToSeparateFile = false;
synthesisParams.writeOriginalMinusHarmonicPartToSeparateFile = false;
}
/**
* Synthesize given vocalization (i.e. unit-selection)
*
* @param backchannelNumber
* unit index
* @param aft
* audio file format
* @return AudioInputStream of synthesized vocalization
* @throws SynthesisException
* if failed to synthesize vocalization
*/
@Override
public AudioInputStream synthesize(int backchannelNumber, AudioFileFormat aft) throws SynthesisException {
int numberOfBackChannels = unitFileReader.getNumberOfUnits();
if (backchannelNumber >= numberOfBackChannels) {
throw new IllegalArgumentException("This voice has " + numberOfBackChannels
+ " backchannels only. so it doesn't support unit number " + backchannelNumber);
}
VocalizationUnit bUnit = unitFileReader.getUnit(backchannelNumber);
long start = bUnit.startTime;
int duration = bUnit.duration;
Datagram[] frames = null;
try {
frames = audioTimeline.getDatagrams(start, duration);
} catch (IOException e) {
throw new SynthesisException("Can not read data from timeline file " + e);
}
// Generate audio from frames
LinkedList datagrams = new LinkedList();
datagrams.addAll(Arrays.asList(frames));
DoubleDataSource audioSource = new DatagramDoubleDataSource(datagrams);
// audioSource.getAllData();
return (new DDSAudioInputStream(new BufferedDoubleDataSource(audioSource), aft.getFormat()));
}
/**
* Re-synthesize given vocalization using HNM technology
*
* @param backchannelNumber
* unit index
* @param aft
* audio file format
* @return AudioInputStream of synthesized vocalization
* @throws SynthesisException
* if failed to synthesize vocalization
*/
@Override
public AudioInputStream reSynthesize(int backchannelNumber, AudioFileFormat aft) throws SynthesisException {
float[] pScalesArray = { 1.0f };
float[] tScalesArray = { 1.0f };
float[] tScalesTimes = { 1.0f };
float[] pScalesTimes = { 1.0f };
return synthesizeUsingF0Modification(backchannelNumber, pScalesArray, pScalesTimes, tScalesArray, tScalesTimes, aft);
}
/**
* Impose target intonation contour on given vocalization using HNM technology
*
* @param sourceIndex
* unit index of vocalization
* @param targetIndex
* unit index of target intonation
* @param aft
* audio file format
* @return AudioInputStream of synthesized vocalization
* @throws SynthesisException
* if failed to synthesize vocalization
*/
@Override
public AudioInputStream synthesizeUsingImposedF0(int sourceIndex, int targetIndex, AudioFileFormat aft)
throws SynthesisException {
if (!f0ContourImposeSupport) {
throw new SynthesisException("Mary configuration of this voice doesn't support intonation contour imposition");
}
int numberOfUnits = vHNMFeaturesReader.getNumberOfUnits();
if (sourceIndex >= numberOfUnits || targetIndex >= numberOfUnits) {
throw new IllegalArgumentException("sourceIndex(" + sourceIndex + ") and targetIndex(" + targetIndex
+ ") are should be less than number of available units (" + numberOfUnits + ")");
}
double[] sourceF0 = this.vIntonationReader.getContour(sourceIndex);
double[] targetF0coeffs = this.vIntonationReader.getIntonationCoeffs(targetIndex);
double[] sourceF0coeffs = this.vIntonationReader.getIntonationCoeffs(sourceIndex);
if (targetF0coeffs == null || sourceF0coeffs == null) {
return reSynthesize(sourceIndex, aft);
}
if (targetF0coeffs.length == 0 || sourceF0coeffs.length == 0) {
return reSynthesize(sourceIndex, aft);
}
double[] targetF0 = Polynomial.generatePolynomialValues(targetF0coeffs, sourceF0.length, 0, 1);
sourceF0 = Polynomial.generatePolynomialValues(sourceF0coeffs, sourceF0.length, 0, 1);
assert targetF0.length == sourceF0.length;
float[] tScalesArray = { 1.0f };
float[] tScalesTimes = { 1.0f };
float[] pScalesArray = new float[targetF0.length];
float[] pScalesTimes = new float[targetF0.length];
double skipSizeInSeconds = this.vIntonationReader.getSkipSizeInSeconds();
double windowSizeInSeconds = this.vIntonationReader.getWindowSizeInSeconds();
for (int i = 0; i < targetF0.length; i++) {
pScalesArray[i] = (float) (targetF0[i] / sourceF0[i]);
pScalesTimes[i] = (float) (i * skipSizeInSeconds + 0.5 * windowSizeInSeconds);
}
return synthesizeUsingF0Modification(sourceIndex, pScalesArray, pScalesTimes, tScalesArray, tScalesTimes, aft);
}
/**
* modify intonation contour using HNM technology
*
* @param backchannelNumber
* unit index of vocalization
* @param pScalesArray
* pitch scales array
* @param pScalesTimes
* pitch scale times
* @param tScalesArray
* time scales array
* @param tScalesTimes
* time scale times
* @param aft
* audio file format
* @return AudioInputStream of synthesized vocalization
* @throws SynthesisException
* if failed to synthesize vocalization
*/
private AudioInputStream synthesizeUsingF0Modification(int backchannelNumber, float[] pScalesArray, float[] pScalesTimes,
float[] tScalesArray, float[] tScalesTimes, AudioFileFormat aft) throws SynthesisException {
if (backchannelNumber > vHNMFeaturesReader.getNumberOfUnits()) {
throw new IllegalArgumentException("requesting unit should not be more than number of units");
}
if (!f0ContourImposeSupport) {
throw new SynthesisException("Mary configuration of this voice doesn't support intonation contour imposition");
}
BasicProsodyModifierParams pmodParams = new BasicProsodyModifierParams(tScalesArray, tScalesTimes, pScalesArray,
pScalesTimes); // Prosody from modification factors above
HntmSpeechSignal hnmSignal = vHNMFeaturesReader.getHntmSpeechSignal(backchannelNumber);
HntmSynthesizer hs = new HntmSynthesizer();
HntmSynthesizedSignal xhat = hs.synthesize(hnmSignal, null, null, pmodParams, null, analysisParams, synthesisParams);
AudioFormat af;
if (aft == null) { // default audio format
float sampleRate = 16000.0F; // 8000,11025,16000,22050,44100
int sampleSizeInBits = 16; // 8,16
int channels = 1; // 1,2
boolean signed = true; // true,false
boolean bigEndian = false; // true,false
af = new AudioFormat(sampleRate, sampleSizeInBits, channels, signed, bigEndian);
} else {
af = aft.getFormat();
}
double[] audio_double = xhat.output;
/* Normalise the signal before return, this will normalise between 1 and -1 */
double MaxSample = MathUtils.getAbsMax(audio_double);
for (int i = 0; i < audio_double.length; i++) {
audio_double[i] = 0.3 * (audio_double[i] / MaxSample);
}
// DDSAudioInputStream oais = new DDSAudioInputStream(new BufferedDoubleDataSource(audio_double), aft.getFormat());
DDSAudioInputStream oais = new DDSAudioInputStream(new BufferedDoubleDataSource(audio_double), af);
return oais;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy