marytts.tools.analysis.CopySynthesis Maven / Gradle / Ivy
The newest version!
/**
* Copyright 2009 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see .
*
*/
package marytts.tools.analysis;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.util.Arrays;
import java.util.Formatter;
import java.util.Locale;
import java.util.regex.Pattern;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import marytts.client.http.MaryHttpClient;
import marytts.datatypes.MaryXML;
import marytts.modules.phonemiser.AllophoneSet;
import marytts.signalproc.analysis.F0TrackerAutocorrelationHeuristic;
import marytts.signalproc.analysis.Label;
import marytts.signalproc.analysis.Labels;
import marytts.signalproc.analysis.PitchFileHeader;
import marytts.signalproc.analysis.PitchReaderWriter;
import marytts.util.data.audio.AudioDoubleDataSource;
import marytts.util.data.text.LabelfileDoubleDataSource;
import marytts.util.data.text.PraatPitchTier;
import marytts.util.dom.DomUtils;
import marytts.util.dom.MaryDomUtils;
import marytts.util.string.StringUtils;
import org.apache.commons.io.FileUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.traversal.NodeIterator;
/**
* Impose duration and/or intonation from one version of an utterance to another version. The segmental chain of both utterances
* can have tiny deviations but must be sufficiently similar to allow the MaryTranscriptionAligner to match the two. The
* destination must be in ALLOPHONES or ACOUSTPARAMS format, i.e. have syllabic and segmental substructure in the tokens. The
* source can be in a number of formats.
*
* @author marc
*
*/
public class CopySynthesis {
AllophoneSet allophoneSet;
/**
* Provide copy synthesis functionality for documents using the given allophone set.
*
* @param allophoneSet
* the allophone set to use, or null if no allophone verification is needed.
*/
public CopySynthesis(AllophoneSet allophoneSet) {
this.allophoneSet = allophoneSet;
}
/**
* Make sure that the label sequence as provided in source is copied into the target document.
*
* @param source
* a label sequence consisting of valid allophones according to the allophone set given in the constructor.
* @param target
* a MaryXML document at the stage of ALLOPHONES or ACOUSTPARAMS, i.e. with syllable/segment substructure in
* phones.
*/
public void imposeSegments(Labels source, Document target) {
MaryTranscriptionAligner aligner = new MaryTranscriptionAligner(allophoneSet, true);
aligner.SetEnsureInitialBoundary(false);
String labels = StringUtils.join(aligner.getEntrySeparator(), source.getLabelSymbols());
aligner.alignXmlTranscriptions(target, labels);
}
/**
* Make sure that the label sequence as provided in source is copied into the target document, and that the phone and boundary
* durations in the target are adjusted to those in source.
*
* @param source
* a label sequence consisting of valid allophones according to the allophone set given in the constructor.
* @param target
* a MaryXML document at the stage of ALLOPHONES or ACOUSTPARAMS, i.e. with syllable/segment substructure in
* phones.
*/
public void imposeDurations(Labels source, Document target) {
imposeSegments(source, target);
// we trust that the following holds: (see CopySynthesisTest):
// assert Arrays.deepEquals(source.getLabelSymbols(), new Labels(target).getLabelSymbols());
// or in other words, the number of labels in source are now the same as the number of
// phone and boundary items in target.
String PHONE = "ph";
String BOUNDARY = "boundary";
NodeIterator it = DomUtils.createNodeIterator(target, PHONE, BOUNDARY);
Element e = null;
double prevEndTime = 0;
int iSource = 0;
while ((e = (Element) it.nextNode()) != null) {
updateDurationAndEndTime(e, source.items[iSource], prevEndTime);
prevEndTime = source.items[iSource].time;
iSource++;
}
}
private void updateDurationAndEndTime(Element e, Label label, double prevEndTime) {
String PHONE = "ph";
String A_PHONE_DURATION = "d";
String A_PHONE_SYMBOL = "p";
String A_PHONE_END = "end";
String BOUNDARY = "boundary";
String A_BOUNDARY_DURATION = "duration";
assert label.time > prevEndTime;
double durationSeconds = label.time - prevEndTime;
String durationMillisString = String.valueOf((int) Math.round(1000 * durationSeconds));
// System.out.println("For label "+label+", setting duration "+durationSeconds+" -> "+durationMillisString);
if (e.getTagName().equals(PHONE)) {
assert label.phn.equals(e.getAttribute(A_PHONE_SYMBOL));
e.setAttribute(A_PHONE_DURATION, durationMillisString);
e.setAttribute(A_PHONE_END, String.valueOf(label.time));
} else {
assert e.getTagName().equals(BOUNDARY);
e.setAttribute(A_BOUNDARY_DURATION, durationMillisString);
}
}
/**
* Make sure that 1. the label sequence as provided in source is copied into the target document, 2. the phone and boundary
* durations in the target are adjusted to those in source, and 3. the intonation targets in the target are replaced by those
* in the pitchSource.
*
* @param durationAndSegmentSource
* a label sequence consisting of valid allophones according to the allophone set given in the constructor.
* @param pitchSource
* a specification of an intonation contour.
* @param target
* a MaryXML document at the stage of ALLOPHONES or ACOUSTPARAMS, i.e. with syllable/segment substructure in
* phones.
*/
public void imposeIntonation(Labels durationAndSegmentSource, PraatPitchTier pitchSource, Document target) {
throw new RuntimeException("Not yet implemented");
}
/**
* @param args
* args
* @throws Exception
* Exception
*/
public static void main(String[] args) throws Exception {
String wavFilename = null;
String labFilename = null;
String pitchFilename = null;
String textFilename = null;
String locale = System.getProperty("locale");
if (locale == null) {
throw new IllegalArgumentException("No locale given (-Dlocale=...)");
}
for (String arg : args) {
if (arg.endsWith(".txt"))
textFilename = arg;
else if (arg.endsWith(".wav"))
wavFilename = arg;
else if (arg.endsWith(".ptc"))
pitchFilename = arg;
else if (arg.endsWith(".lab"))
labFilename = arg;
else
throw new IllegalArgumentException("Don't know how to treat argument: " + arg);
}
// The intonation contour
double[] contour = null;
double frameShiftTime = -1;
if (pitchFilename == null) { // need to create pitch contour from wav file
if (wavFilename == null) {
throw new IllegalArgumentException("Need either a pitch file or a wav file");
}
AudioInputStream ais = AudioSystem.getAudioInputStream(new File(wavFilename));
AudioDoubleDataSource audio = new AudioDoubleDataSource(ais);
PitchFileHeader params = new PitchFileHeader();
params.fs = (int) ais.getFormat().getSampleRate();
F0TrackerAutocorrelationHeuristic tracker = new F0TrackerAutocorrelationHeuristic(params);
tracker.pitchAnalyze(audio);
frameShiftTime = tracker.getSkipSizeInSeconds();
contour = tracker.getF0Contour();
} else { // have a pitch file -- ignore any wav file
PitchReaderWriter f0rw = new PitchReaderWriter(pitchFilename);
if (f0rw.contour == null) {
throw new NullPointerException("Cannot read f0 contour from " + pitchFilename);
}
contour = f0rw.contour;
frameShiftTime = f0rw.header.skipSizeInSeconds;
}
assert contour != null;
assert frameShiftTime > 0;
// The ALLOPHONES data and labels
if (labFilename == null) {
throw new IllegalArgumentException("No label file given");
}
if (textFilename == null) {
throw new IllegalArgumentException("No text file given");
}
MaryTranscriptionAligner aligner = new MaryTranscriptionAligner();
aligner.SetEnsureInitialBoundary(false);
String labels = MaryTranscriptionAligner.readLabelFile(aligner.getEntrySeparator(), aligner.getEnsureInitialBoundary(),
labFilename);
MaryHttpClient mary = new MaryHttpClient();
String text = FileUtils.readFileToString(new File(textFilename), "ASCII");
ByteArrayOutputStream baos = new ByteArrayOutputStream();
mary.process(text, "TEXT", "ALLOPHONES", locale, null, null, baos);
ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
docFactory.setNamespaceAware(true);
DocumentBuilder builder = docFactory.newDocumentBuilder();
Document doc = builder.parse(bais);
aligner.alignXmlTranscriptions(doc, labels);
assert doc != null;
// durations
double[] endTimes = new LabelfileDoubleDataSource(new File(labFilename)).getAllData();
assert endTimes.length == labels.split(Pattern.quote(aligner.getEntrySeparator())).length;
// Now add durations and f0 targets to document
double prevEnd = 0;
NodeIterator ni = MaryDomUtils.createNodeIterator(doc, MaryXML.PHONE, MaryXML.BOUNDARY);
for (int i = 0; i < endTimes.length; i++) {
Element e = (Element) ni.nextNode();
if (e == null)
throw new IllegalStateException("More durations than elements -- this should not happen!");
double durInSeconds = endTimes[i] - prevEnd;
int durInMillis = (int) (1000 * durInSeconds);
if (e.getTagName().equals(MaryXML.PHONE)) {
e.setAttribute("d", String.valueOf(durInMillis));
e.setAttribute("end", new Formatter(Locale.US).format("%.3f", endTimes[i]).toString());
// f0 targets at beginning, mid, and end of phone
StringBuilder f0String = new StringBuilder();
double startF0 = getF0(contour, frameShiftTime, prevEnd);
if (startF0 != 0 && !Double.isNaN(startF0)) {
f0String.append("(0,").append((int) startF0).append(")");
}
double midF0 = getF0(contour, frameShiftTime, prevEnd + 0.5 * durInSeconds);
if (midF0 != 0 && !Double.isNaN(midF0)) {
f0String.append("(50,").append((int) midF0).append(")");
}
double endF0 = getF0(contour, frameShiftTime, endTimes[i]);
if (endF0 != 0 && !Double.isNaN(endF0)) {
f0String.append("(100,").append((int) endF0).append(")");
}
if (f0String.length() > 0) {
e.setAttribute("f0", f0String.toString());
}
} else { // boundary
e.setAttribute("duration", String.valueOf(durInMillis));
}
prevEnd = endTimes[i];
}
if (ni.nextNode() != null) {
throw new IllegalStateException("More elements than durations -- this should not happen!");
}
// TODO: add pitch values
String acoustparams = DomUtils.document2String(doc);
System.out.println("ACOUSTPARAMS:");
System.out.println(acoustparams);
}
/**
* For the given sampled contour and skipSize, provide the f0 value at time t if possible or Double.NaN otherwise.
*
* @param contour
* contour
* @param skipSize
* skipSize
* @param t
* t
* @return Double.NaN
*/
private static double getF0(double[] contour, double skipSize, double t) {
int i = (int) (t / skipSize);
if (i >= 0 && i < contour.length)
return contour[i];
return Double.NaN;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy