/*
 * org.daisy.pipeline.tts.calabash.impl.SSMLtoAudio
 *
 * Part of the "tts-common" artifact (Maven group org.daisy.pipeline):
 * common API for TTS functionality.
 */
package org.daisy.pipeline.tts.calabash.impl;
import java.io.File;
import java.math.BigDecimal;
import java.math.MathContext;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.HashMap;
import java.util.IllformedLocaleException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import javax.sound.sampled.AudioFileFormat;
import net.sf.saxon.s9api.Axis;
import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.s9api.XdmSequenceIterator;
import org.daisy.common.messaging.MessageAppender;
import org.daisy.common.properties.Properties;
import org.daisy.common.properties.Properties.Property;
import org.daisy.pipeline.audio.AudioServices;
import org.daisy.pipeline.tts.AudioFootprintMonitor;
import org.daisy.pipeline.tts.calabash.impl.EncodingThread.EncodingException;
import org.daisy.pipeline.tts.config.VoiceConfigExtension;
import org.daisy.pipeline.tts.DefaultSSMLMarkSplitter;
import org.daisy.pipeline.tts.SSMLMarkSplitter;
import org.daisy.pipeline.tts.Sentence;
import org.daisy.pipeline.tts.TTSEngine;
import org.daisy.pipeline.tts.TTSLog;
import org.daisy.pipeline.tts.TTSLog.ErrorCode;
import org.daisy.pipeline.tts.TTSRegistry;
import org.daisy.pipeline.tts.TTSService.SynthesisException;
import org.daisy.pipeline.tts.TimedTTSExecutor;
import org.daisy.pipeline.tts.Voice;
import org.daisy.pipeline.tts.VoiceInfo.Gender;
import org.daisy.pipeline.tts.VoiceManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Iterables;
/**
* SSMLtoAudio splits the input SSML sentences into sections called
* ContiguousText. Every section contains a list of contiguous sentences that
* may be eventually stored into different audio files, but it is guaranteed
* that such audio files won't contain sentences of other sections in such a
* manner that the list of audio files will mirror the document order. Within
* a section, every sentence has the same single sample rate to prevent us from
* resampling the audio data before sending them to encoders.
*
* Once all the sentences have been assigned to TTS voices, they are sorted by
* size and stored in a shared queue of ContiguousText (actually, only the
* smallest sentences are sorted that way). SSMLtoAudio creates threads to
* consume this queue.
*
* The TextToPcmThreads send PCM data to EncodingThreads via a queue of
* ContiguousPCM. These PCM packets are then processed from the longest (with
* respect to the number of samples) to the shortest in order to make it likely
* that the threads will finish at the same time. When all the TextToPcmThreads
are joined, the pipeline pushes an EndOfQueue marker to every EncodingThread
to notify that it must stop waiting for more PCM packets than the ones
already pushed.
*
* The queue of PCM chunks, along with the queues of other TTS steps, share a
* global max size so as to make sure that they won't grow too much if the
* encoding is slower than the synthesizing (see AudioFootprintMonitor).
*
* If an EncodingThread fails to encode samples, it won't set the URI attribute
* of the audio chunks. In that way, SSMLtoAudio is informed not to include the
* corresponding text into the list of audio clips.
*
*/
public class SSMLtoAudio implements FormatSpecifications {
private static final Property THREADS_NUMBER = Properties.getProperty("org.daisy.pipeline.tts.threads.number",
false,
"Number of threads for speech synthesis and audio encoding",
false,
""+Runtime.getRuntime().availableProcessors());
private static final Property ENCODING_THREADS_NUMBER = Properties.getProperty("org.daisy.pipeline.tts.threads.encoding.number",
false,
"Number of threads for audio encoding",
false,
null);
private static final Property SPEAKING_THREADS_NUMBER = Properties.getProperty("org.daisy.pipeline.tts.threads.speaking.number",
false,
"Number of threads for speech synthesis",
false,
null);
private static final Property THREADS_MEMLIMIT = Properties.getProperty("org.daisy.pipeline.tts.threads.each.memlimit",
false,
"Maximum amount of memory consumed by each speech synthesis thread (in Mb)",
false,
"20"); // 20 Mb
private TTSEngine mLastTTS; //used if no TTS is found for the current sentence
private TTSRegistry mTTSRegistry;
private Logger mLogger;
private ContiguousText mCurrentSection;
private File mAudioDir; //where all the sound files will be stored
private final AudioFileFormat.Type mAudioFileFormat;
/*
* The maximum number of sentences that a section (ContiguousText) can contain.
*/
private final int mMaxSentencesPerSection;
private int mSentenceCounter = 0;
private long mTotalTextSize;
private int mDocumentPosition;
private Map> mOrganizedText;
private AudioFootprintMonitor mAudioFootprintMonitor;
private Processor mProc;
private VoiceManager mVoiceManager;
private TimedTTSExecutor mExecutor = new TimedTTSExecutor();
private final Map mProperties;
private TTSLog mTTSlog;
private int mErrorCounter;
int numberOfCores = getPropertyAsInt(THREADS_NUMBER).get(); // for unit tests to override
public SSMLtoAudio(File audioDir, AudioFileFormat.Type audioFileFormat,
int maxSentencesPerSection, TTSRegistry ttsregistry, Logger logger,
AudioFootprintMonitor audioFootprintMonitor, Processor proc,
Map properties, VoiceConfigExtension configExt, TTSLog logs) {
mTTSRegistry = ttsregistry;
mLogger = logger;
mCurrentSection = null;
mDocumentPosition = 0;
mOrganizedText = new HashMap>();
mAudioFootprintMonitor = audioFootprintMonitor;
mProc = proc;
mAudioDir = audioDir;
mAudioFileFormat = audioFileFormat;
mMaxSentencesPerSection = maxSentencesPerSection;
mTTSlog = logs;
/*
* initialize the TTS engines
*/
mProperties = properties;
mVoiceManager = new VoiceManager(
ttsregistry.getWorkingEngines(properties, mTTSlog, mLogger),
configExt.getVoiceDeclarations());
}
/**
* @param node Document node of SSML document
*/
public void feedSSML(XdmNode doc) throws SynthesisException {
XdmSequenceIterator iter = doc.axisIterator(Axis.CHILD);
if (iter.hasNext()) {
traverse((XdmNode)iter.next(), null);
}
endSection();
}
private void traverse(XdmNode node, Locale lang) throws SynthesisException {
if (SentenceTag.equals(node.getNodeName())) {
if (!dispatchSSML(node, lang))
mErrorCounter++;
if (mMaxSentencesPerSection > 0 && ++mSentenceCounter % mMaxSentencesPerSection == 0)
endSection();
} else {
String langAttr = node.getAttributeValue(Sentence_attr_lang);
if (langAttr != null) {
try {
lang = (new Locale.Builder()).setLanguageTag(langAttr).build();
} catch (IllformedLocaleException e) {
lang = null;
}
}
XdmSequenceIterator iter = node.axisIterator(Axis.CHILD);
while (iter.hasNext())
traverse((XdmNode)iter.next(), lang);
}
}
/**
* The SSML is assumed to be pushed in document order.
*
* @param ssml The input SSML
* @param lang The parent language
* @return true when the SSML was successfully converted to speech, false when there was an error
**/
// package private for tests
boolean dispatchSSML(XdmNode ssml, Locale lang) throws SynthesisException {
String voiceEngine = ssml.getAttributeValue(Sentence_attr_select1);
String voiceName = ssml.getAttributeValue(Sentence_attr_select2);
Gender gender; {
String attr = ssml.getAttributeValue(Sentence_attr_gender);
String ageAttr = ssml.getAttributeValue(Sentence_attr_age);
if (attr != null && ageAttr != null) {
try {
int age = Integer.parseInt(ageAttr);
if (age <= 16) {
gender = Gender.of(attr + "-child");
} else if (age >= 70) {
gender = Gender.of(attr + "-eldery");
} else {
gender = Gender.of(attr);
}
} catch (NumberFormatException e) {
gender = Gender.of(attr);
}
} else {
gender = Gender.of(attr);
}
}
{
String langAttr = ssml.getAttributeValue(Sentence_attr_lang);
if (langAttr != null) {
try {
lang = (new Locale.Builder()).setLanguageTag(langAttr).build();
} catch (IllformedLocaleException e) {
lang = null;
}
}
}
String id = ssml.getAttributeValue(Sentence_attr_id);
TTSLog.Entry logEntry = mTTSlog.getOrCreateEntry(id);
logEntry.setSSML(ssml);
Iterable voices = mVoiceManager.findAvailableVoices(voiceEngine, voiceName, lang, gender);
Voice preferredVoice = Iterables.getFirst(voices, null);
logEntry.setSelectedVoice(preferredVoice);
if (preferredVoice == null) {
String err = "could not find any installed voice matching with "
+ new Voice(voiceEngine, voiceName)
+ " or providing the language '" + lang + "'";
logEntry.addError(new TTSLog.Error(TTSLog.ErrorCode.AUDIO_MISSING, err));
return false;
}
TTSEngine newSynth = mVoiceManager.getTTS(preferredVoice);
if (newSynth == null) {
/*
* Should not happen since findAvailableVoice() returns only a
* non-null voice if a TTSService can provide it
*/
String err = "could not find any TTS engine for the voice "
+ new Voice(voiceEngine, voiceName);
logEntry.addError(new TTSLog.Error(TTSLog.ErrorCode.AUDIO_MISSING, err));
return false;
}
if (!mVoiceManager.matches(preferredVoice, voiceEngine, voiceName, lang, gender)) {
logEntry.addError(new TTSLog.Error(TTSLog.ErrorCode.UNEXPECTED_VOICE,
"no voice matches exactly with the requested characteristics"));
}
/*
* If a TTS engine has no reserved threads, its sentences are pushed to
* a global list whose key is null. Otherwise the TTS Service is used as
* key.
*/
TTSEngine poolkey = null;
if (newSynth.reservedThreadNum() > 0) {
poolkey = newSynth;
}
if (newSynth != mLastTTS) {
if (mLastTTS != null
&& (poolkey != null || mLastTTS.reservedThreadNum() > 0))
endSection(); // necessary because the same thread wouldn't be able to
// concatenate outputs of different formats
mLastTTS = newSynth;
}
if (mCurrentSection == null) {
//happen the first time and whenever endSection() is called
mCurrentSection = new ContiguousText(mDocumentPosition++, mAudioDir);
List listOfSections = mOrganizedText.get(poolkey);
if (listOfSections == null) {
listOfSections = new ArrayList();
mOrganizedText.put(poolkey, listOfSections);
}
listOfSections.add(mCurrentSection);
}
mCurrentSection.sentences.add(new Sentence(newSynth, voices, ssml));
return true;
}
private void endSection() {
mCurrentSection = null;
}
public Iterable blockingRun(AudioServices audioServices)
throws SynthesisException, InterruptedException, EncodingException {
MessageAppender activeBlock = MessageAppender.getActiveBlock(); // px:ssml-to-audio step
//SSML mark splitter shared by the threads:
SSMLMarkSplitter ssmlSplitter = new DefaultSSMLMarkSplitter(mProc); // StructuredSSMLSplitter makes assumptions
// that are not always met, e.g. marks may
// be contained within a a prosody element
reorganizeSections();
//threading layout
int reservedThreadNum = 0;
for (TTSEngine tts : mOrganizedText.keySet()) {
if (tts != null && tts.reservedThreadNum() > 0)
reservedThreadNum += tts.reservedThreadNum();
}
int ttsThreadNum = numberOfCores;
int encodingThreadNum = getPropertyAsInt(ENCODING_THREADS_NUMBER).orElse(ttsThreadNum);
int regularTTSthreadNum = getPropertyAsInt(SPEAKING_THREADS_NUMBER).orElse(ttsThreadNum);
int totalTTSThreads = regularTTSthreadNum + reservedThreadNum;
int maxMemPerTTSThread = getPropertyAsInt(THREADS_MEMLIMIT).get() * 1048576;
mLogger.info("Number of encoding threads: " + encodingThreadNum);
mLogger.info("Number of regular text-to-speech threads: " + regularTTSthreadNum);
mLogger.info("Number of reserved text-to-speech threads: " + reservedThreadNum);
mLogger.info("Max TTS memory footprint (encoding excluded): "
+ mAudioFootprintMonitor.getSpaceForTTS() / 1000000 + "MB");
mLogger.info("Max encoding memory footprint: "
+ mAudioFootprintMonitor.getSpaceForEncoding() / 1000000 + "MB");
//input queue common to all the threads
BlockingQueue pcmQueue = new PriorityBlockingQueue();
//start the TTS threads
TextToPcmThread[] tpt = new TextToPcmThread[totalTTSThreads];
List text = mOrganizedText.get(null);
if (text == null) {
text = Collections.EMPTY_LIST;
}
int i = 0;
ConcurrentLinkedQueue stext = new ConcurrentLinkedQueue(text);
if (regularTTSthreadNum > 0) {
long textSize = stext.stream().mapToLong(ContiguousText::getStringSize).sum();
BigDecimal portion = textSize == 0
? BigDecimal.ZERO
: new BigDecimal(textSize).divide(new BigDecimal(mTotalTextSize), MathContext.DECIMAL128)
.divide(new BigDecimal(regularTTSthreadNum), MathContext.DECIMAL128);
for (; i < regularTTSthreadNum; ++i) {
tpt[i] = new TextToPcmThread();
tpt[i].start(stext, pcmQueue, mExecutor, mTTSRegistry, mVoiceManager, ssmlSplitter,
mLogger, mAudioFootprintMonitor, maxMemPerTTSThread, mTTSlog, activeBlock,
mTotalTextSize, portion);
}
}
for (Map.Entry> e : mOrganizedText.entrySet()) {
TTSEngine tts = e.getKey();
if (tts != null) { //tts = null is handled by the previous loop
stext = new ConcurrentLinkedQueue(e.getValue());
if (tts.reservedThreadNum() > 0) {
long textSize = stext.stream().mapToLong(ContiguousText::getStringSize).sum();
BigDecimal portion = textSize == 0
? BigDecimal.ZERO
: new BigDecimal(textSize).divide(new BigDecimal(mTotalTextSize), MathContext.DECIMAL128)
.divide(new BigDecimal(tts.reservedThreadNum()), MathContext.DECIMAL128);
for (int j = 0; j < tts.reservedThreadNum(); ++i, ++j) {
tpt[i] = new TextToPcmThread();
tpt[i].start(stext, pcmQueue, mExecutor, mTTSRegistry, mVoiceManager, ssmlSplitter,
mLogger, mAudioFootprintMonitor, maxMemPerTTSThread, mTTSlog,
activeBlock, mTotalTextSize, portion);
}
}
}
}
mLogger.info("Text-to-speech threads started.");
//start the encoding threads
EncodingThread[] encodingTh = new EncodingThread[encodingThreadNum];
for (int j = 0; j < encodingTh.length; ++j) {
encodingTh[j] = new EncodingThread();
encodingTh[j].start(mAudioFileFormat, audioServices, pcmQueue,
mAudioFootprintMonitor, mProperties, mTTSlog, activeBlock);
}
mLogger.info("Encoding threads started.");
//collect the sound fragments
Collection[] fragments = new Collection[tpt.length];
for (int j = 0; j < tpt.length; ++j) {
fragments[j] = tpt[j].getSoundFragments();
}
for (int j = 0; j < tpt.length; ++j) {
mErrorCounter += tpt[j].getErrorCount();
}
//send END notifications and wait for the encoding threads to finish
mLogger.info("Text-to-speech finished. Waiting for audio encoding to finish...");
for (int k = 0; k < encodingTh.length; ++k) {
pcmQueue.add(ContiguousPCM.EndOfQueue);
}
for (int j = 0; j < encodingTh.length; ++j) {
try {
encodingTh[j].waitToFinish();
} catch (EncodingException e) {
while (++j < encodingTh.length)
// FIXME: interrupt instead of waiting
try { encodingTh[j].waitToFinish(); }
catch (EncodingException _e) {}
throw e;
}
}
mLogger.info("Audio encoding finished.");
return Iterables.concat(fragments);
}
double getErrorRate() {
return (double)mErrorCounter/mSentenceCounter;
}
private void reorganizeSections() {
mTotalTextSize = 0;
int sectionCount = 0;
for (List sections : mOrganizedText.values()) {
// compute the sections' size: needed for displaying the progress,
// splitting the sections and sorting them
for (ContiguousText section : sections) {
section.computeSize();
mTotalTextSize += section.getStringSize();
}
// split up the sections that are too big
// FIXME: Explain what is "too big".
// FIXME: Code was disabled for now because it does not make sense to let the maximum
// size depend on the total size (and mTotalTextSize isn't even constant).
/*int maxSize = (int) (mTotalTextSize / 15);
List newSections = new ArrayList();
List toRemove = new ArrayList();
for (ContiguousText section : sections) {
if (section.getStringSize() >= maxSize) {
toRemove.add(section);
splitSection(section, maxSize, newSections);
}
}
sections.removeAll(toRemove);
sections.addAll(newSections);*/
// sort the sections according to their size in descending order
// FIXME: Explain why this is done.
Collections.sort(sections);
// keep sorted only the smallest sections (50% of total) so the biggest sections won't
// necessarily be processed at the same time, as that may consume too much memory.
Collections.shuffle(sections.subList(0, sections.size() / 2));
sectionCount += sections.size();
}
mLogger.debug("Number of synthesizable TTS sections: " + sectionCount);
}
//we can dispense with this function as soon as we take into consideration the size of the SSML
//sentences, rather than creating a new section every 10 sentences or so.
private void splitSection(ContiguousText section, int maxSize,
List newSections) {
int left = 0;
int count = 0;
ContiguousText currentSection = new ContiguousText(section.getDocumentPosition(),
section.getAudioOutputDir());
currentSection.setStringsize(0);
for (int right = 0; right < section.sentences.size(); ++right) {
if (currentSection.getStringSize() > maxSize) {
currentSection.sentences = section.sentences.subList(left, right);
currentSection.setDocumentSplitPosition(count);
newSections.add(currentSection);
currentSection = new ContiguousText(section.getDocumentPosition(), section
.getAudioOutputDir());
currentSection.setStringsize(0);
left = right;
++count;
}
currentSection.setStringsize(currentSection.getStringSize()
+ section.sentences.get(right).getSize());
}
currentSection.sentences = section.sentences.subList(left, section.sentences.size());
currentSection.setDocumentSplitPosition(count);
newSections.add(currentSection);
}
private Optional getPropertyAsInt(Property prop) {
String str = prop.getValue(mProperties);
if (str != null) {
try {
return Optional.of(Integer.valueOf(str));
} catch (NumberFormatException e) {
mTTSlog.addGeneralError(
ErrorCode.WARNING, str + " is not a valid value for property " + prop.getName(), e);
}
}
return Optional.empty();
}
}