marytts.unitselection.interpolation.InterpolatingSynthesizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of marytts-runtime Show documentation
The newest version!
/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see .
 *
 */
package marytts.unitselection.interpolation;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Iterator;
import java.util.List;

import javax.sound.sampled.AudioInputStream;

import marytts.datatypes.MaryXML;
import marytts.exceptions.SynthesisException;
import marytts.modules.synthesis.Voice;
import marytts.modules.synthesis.WaveformSynthesizer;
import marytts.signalproc.process.FramewiseMerger;
import marytts.signalproc.process.LSFInterpolator;
import marytts.unitselection.UnitSelectionVoice;
import marytts.unitselection.concat.BaseUnitConcatenator;
import marytts.unitselection.concat.UnitConcatenator;
import marytts.unitselection.select.SelectedUnit;
import marytts.unitselection.select.UnitSelector;
import marytts.util.MaryUtils;
import marytts.util.data.BufferedDoubleDataSource;
import marytts.util.data.Datagram;
import marytts.util.data.DoubleDataSource;
import marytts.util.data.audio.AudioDoubleDataSource;
import marytts.util.data.audio.DDSAudioInputStream;
import marytts.util.dom.MaryDomUtils;

import org.apache.log4j.Logger;
import org.w3c.dom.Element;

/**
 * @author marc
 *
 */
public class InterpolatingSynthesizer implements WaveformSynthesizer {
	protected Logger logger;

	/**
     * 
     */
	public InterpolatingSynthesizer() {
	}

	/**
	 * Start up the waveform synthesizer. This must be called once before calling synthesize().
	 * 
	 * @throws Exception
	 *             Exception
	 */
	public void startup() throws Exception {
		logger = MaryUtils.getLogger("InterpolatingSynthesizer");
		// Register interpolating voice:
		Voice.registerVoice(new InterpolatingVoice(this, "interpolatingvoice"));
		logger.info("started.");
	}

	/**
	 * Perform a power-on self test by processing some example input data.
	 * 
	 * @throws Error
	 *             if the module does not work properly.
	 */
	public void powerOnSelfTest() throws Error {
	}

	/**
	 * {@inheritDoc}
	 * 
	 * @param tokensAndBoundaries
	 *            tokensAndBoundaries
	 * @param voice
	 *            voice
	 * @param outputParams
	 *            outputParams
	 * @throws SynthesisException
	 *             SynthesisException
	 * @return outputAudio
	 */
	public AudioInputStream synthesize(List tokensAndBoundaries, Voice voice, String outputParams)
			throws SynthesisException {
		if (tokensAndBoundaries.size() == 0)
			return null;

		// 1. determine the two voices involved;
		Element first = (Element) tokensAndBoundaries.get(0);
		Element voiceElement = (Element) MaryDomUtils.getAncestor(first, MaryXML.VOICE);
		String name = voiceElement.getAttribute("name");
		// name has the form: voice1 with XY% voice2
		// We trust that InerpolatingVoice.hasName() has done its job to verify that
		// name actually has that form.
		String[] parts = name.split("\\s+");
		assert parts.length == 4;
		assert parts[1].equals("with");
		assert parts[2].endsWith("%");
		int percent = Integer.parseInt(parts[2].substring(0, parts[2].length() - 1));
		Voice voice1 = Voice.getVoice(parts[0]);
		assert voice1 != null;
		Voice voice2 = Voice.getVoice(parts[3]);
		assert voice2 != null;

		// 2. do unit selection with each;
		if (!(voice1 instanceof UnitSelectionVoice)) {
			throw new IllegalArgumentException("Voices of type " + voice.getClass().getName() + " not supported!");
		}
		if (!(voice2 instanceof UnitSelectionVoice)) {
			throw new IllegalArgumentException("Voices of type " + voice.getClass().getName() + " not supported!");
		}
		UnitSelectionVoice usv1 = (UnitSelectionVoice) voice1;
		UnitSelectionVoice usv2 = (UnitSelectionVoice) voice2;

		UnitSelector unitSel1 = usv1.getUnitSelector();
		List selectedUnits1 = unitSel1.selectUnits(tokensAndBoundaries, voice);
		UnitSelector unitSel2 = usv2.getUnitSelector();
		List selectedUnits2 = unitSel2.selectUnits(tokensAndBoundaries, voice);
		assert selectedUnits1.size() == selectedUnits2.size() : "Unexpected difference in number of units: "
				+ selectedUnits1.size() + " vs. " + selectedUnits2.size();
		int numUnits = selectedUnits1.size();

		// 3. do unit concatenation with each, retrieve actual unit durations from list of units;
		UnitConcatenator unitConcatenator1 = usv1.getConcatenator();
		AudioInputStream audio1;
		try {
			audio1 = unitConcatenator1.getAudio(selectedUnits1);
		} catch (IOException ioe) {
			StringWriter sw = new StringWriter();
			PrintWriter pw = new PrintWriter(sw);
			for (Iterator selIt = selectedUnits1.iterator(); selIt.hasNext();)
				pw.println(selIt.next());
			throw new SynthesisException("For voice " + voice1.getName() + ", problems generating audio for unit chain: "
					+ sw.toString(), ioe);
		}
		DoubleDataSource audioSource1 = new AudioDoubleDataSource(audio1);
		UnitConcatenator unitConcatenator2 = usv2.getConcatenator();
		AudioInputStream audio2;
		try {
			audio2 = unitConcatenator2.getAudio(selectedUnits2);
		} catch (IOException ioe) {
			StringWriter sw = new StringWriter();
			PrintWriter pw = new PrintWriter(sw);
			for (Iterator selIt = selectedUnits2.iterator(); selIt.hasNext();)
				pw.println(selIt.next());
			throw new SynthesisException("For voice " + voice2.getName() + ", problems generating audio for unit chain: "
					+ sw.toString(), ioe);
		}
		DoubleDataSource audioSource2 = new AudioDoubleDataSource(audio2);
		// Retrieve actual durations from list of units:
		int sampleRate1 = (int) usv1.dbAudioFormat().getSampleRate();
		double[] label1 = new double[numUnits];
		double t1 = 0;
		int sampleRate2 = (int) usv2.dbAudioFormat().getSampleRate();
		double[] label2 = new double[numUnits];
		double t2 = 0;
		for (int i = 0; i < numUnits; i++) {
			SelectedUnit u1 = selectedUnits1.get(i);
			SelectedUnit u2 = selectedUnits2.get(i);
			BaseUnitConcatenator.UnitData ud1 = (BaseUnitConcatenator.UnitData) u1.getConcatenationData();
			int unitDuration1 = ud1.getUnitDuration();
			if (unitDuration1 < 0) { // was not set by the unit concatenator, have to count ourselves
				unitDuration1 = 0;
				Datagram[] d = ud1.getFrames();
				for (int id = 0; id < d.length; id++) {
					unitDuration1 += d[id].getDuration();
				}
			}
			t1 += unitDuration1 / (float) sampleRate1;
			label1[i] = t1;
			BaseUnitConcatenator.UnitData ud2 = (BaseUnitConcatenator.UnitData) u2.getConcatenationData();
			int unitDuration2 = ud2.getUnitDuration();
			if (unitDuration2 < 0) { // was not set by the unit concatenator, have to count ourselves
				unitDuration2 = 0;
				Datagram[] d = ud2.getFrames();
				for (int id = 0; id < d.length; id++) {
					unitDuration2 += d[id].getDuration();
				}
			}
			t2 += unitDuration2 / (float) sampleRate2;
			label2[i] = t2;
			logger.debug(usv1.getName() + " [" + u1.getTarget() + "] " + label1[i] + " -- " + usv2.getName() + " ["
					+ u2.getTarget() + "] " + label2[i]);
		}

		// 4. with these unit durations, run the LSFInterpolator on the two audio streams.
		int frameLength = Integer.getInteger("signalproc.lpcanalysisresynthesis.framelength", 512).intValue();
		int predictionOrder = Integer.getInteger("signalproc.lpcanalysisresynthesis.predictionorder", 20).intValue();
		double r = (double) percent / 100;
		assert r >= 0;
		assert r <= 1;
		FramewiseMerger foas = new FramewiseMerger(audioSource1, frameLength, sampleRate1, new BufferedDoubleDataSource(label1),
				audioSource2, sampleRate2, new BufferedDoubleDataSource(label2), new LSFInterpolator(predictionOrder, r));
		DDSAudioInputStream outputAudio = new DDSAudioInputStream(new BufferedDoubleDataSource(foas), audio1.getFormat());

		return outputAudio;
	}

}