// Copyright (c) Alibaba, Inc. and its affiliates.
package com.alibaba.dashscope.audio.ttsv2;
import com.alibaba.dashscope.api.SynchronizeFullDuplexApi;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
import com.alibaba.dashscope.common.*;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.InputRequiredException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.protocol.ApiServiceOption;
import com.alibaba.dashscope.protocol.ConnectionOptions;
import com.alibaba.dashscope.protocol.Protocol;
import com.alibaba.dashscope.protocol.StreamingMode;
import com.google.gson.JsonObject;
import io.reactivex.BackpressureStrategy;
import io.reactivex.Emitter;
import io.reactivex.Flowable;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.util.LinkedList;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import lombok.Builder;
import lombok.Getter;
import lombok.NonNull;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j;
/** @author lengjiayi */
@Slf4j
public final class SpeechSynthesizer {
private final Queue<AsyncCmdBuffer> cmdBuffer = new LinkedList<>();
@Getter
// private final List timestamps = new ArrayList<>(); //
// Lists.newCopyOnWriteArrayList();
SynchronizeFullDuplexApi<StreamInputTtsParamWithStream> duplexApi;
private ApiServiceOption serviceOption;
private Emitter<String> textEmitter;
private ResultCallback<SpeechSynthesisResult> callback;
private SpeechSynthesisState state = SpeechSynthesisState.IDLE;
private AtomicReference<CountDownLatch> stopLatch = new AtomicReference<>(null);
private SpeechSynthesisParam parameters;
@Getter private ByteBuffer audioData;
private ByteArrayOutputStream outputStream;
private AtomicReference<String> lastRequestId = new AtomicReference<>();
private boolean isFirst = true;
private AtomicBoolean canceled = new AtomicBoolean(false);
private boolean async_call = false;
private long startStreamTimeStamp = -1;
private long firstPackageTimeStamp = -1;
private double recvAudioLength = 0;
/**
* CosyVoice Speech Synthesis SDK
*
* @param param Configuration for speech synthesis, including voice type, volume, etc.
* @param callback In non-streaming output scenarios, this can be set to null
* @param baseUrl Base URL
* @param connectionOptions Connection options
*/
public SpeechSynthesizer(
SpeechSynthesisParam param,
ResultCallback<SpeechSynthesisResult> callback,
String baseUrl,
ConnectionOptions connectionOptions) {
if (param == null) {
throw new ApiException(
new InputRequiredException("Parameter invalid: StreamInputTtsParam is null"));
}
this.parameters = param;
serviceOption =
ApiServiceOption.builder()
.protocol(Protocol.WEBSOCKET)
.streamingMode(StreamingMode.DUPLEX)
.outputMode(OutputMode.ACCUMULATE)
.taskGroup(TaskGroup.AUDIO.getValue())
.task(Task.TEXT_TO_SPEECH.getValue())
.function(Function.SPEECH_SYNTHESIZER.getValue())
.baseWebSocketUrl(baseUrl)
.build();
duplexApi = new SynchronizeFullDuplexApi<>(connectionOptions, serviceOption);
this.callback = callback;
this.async_call = (this.callback != null);
}
/**
* CosyVoice Speech Synthesis SDK
*
* @param baseUrl Base URL
* @param connectionOptions Connection options
*/
public SpeechSynthesizer(String baseUrl, ConnectionOptions connectionOptions) {
this.parameters = null;
serviceOption =
ApiServiceOption.builder()
.protocol(Protocol.WEBSOCKET)
.streamingMode(StreamingMode.DUPLEX)
.outputMode(OutputMode.ACCUMULATE)
.taskGroup(TaskGroup.AUDIO.getValue())
.task(Task.TEXT_TO_SPEECH.getValue())
.function(Function.SPEECH_SYNTHESIZER.getValue())
.baseWebSocketUrl(baseUrl)
.build();
duplexApi = new SynchronizeFullDuplexApi<>(connectionOptions, serviceOption);
this.callback = null;
}
/** CosyVoice Speech Synthesis SDK */
public SpeechSynthesizer() {
this.parameters = null;
serviceOption =
ApiServiceOption.builder()
.protocol(Protocol.WEBSOCKET)
.streamingMode(StreamingMode.DUPLEX)
.outputMode(OutputMode.ACCUMULATE)
.taskGroup(TaskGroup.AUDIO.getValue())
.task(Task.TEXT_TO_SPEECH.getValue())
.function(Function.SPEECH_SYNTHESIZER.getValue())
.build();
duplexApi = new SynchronizeFullDuplexApi<>(serviceOption);
this.callback = null;
}
public void updateParamAndCallback(
SpeechSynthesisParam param, ResultCallback<SpeechSynthesisResult> callback) {
this.parameters = param;
this.callback = callback;
// reset inner params
this.stopLatch = new AtomicReference<>(null);
this.cmdBuffer.clear();
this.textEmitter = null;
this.isFirst = true;
this.async_call = (this.callback != null);
}
/**
* CosyVoice Speech Synthesis SDK
*
* @param param Configuration for speech synthesis, including voice type, volume, etc.
* @param callback In non-streaming output scenarios, this can be set to null
* @param baseUrl Base URL
*/
public SpeechSynthesizer(
SpeechSynthesisParam param, ResultCallback<SpeechSynthesisResult> callback, String baseUrl) {
if (param == null) {
throw new ApiException(
new InputRequiredException("Parameter invalid: SpeechSynthesisParam is null"));
}
this.parameters = param;
serviceOption =
ApiServiceOption.builder()
.protocol(Protocol.WEBSOCKET)
.streamingMode(StreamingMode.DUPLEX)
.outputMode(OutputMode.ACCUMULATE)
.taskGroup(TaskGroup.AUDIO.getValue())
.task(Task.TEXT_TO_SPEECH.getValue())
.function(Function.SPEECH_SYNTHESIZER.getValue())
.baseWebSocketUrl(baseUrl)
.build();
duplexApi = new SynchronizeFullDuplexApi<>(serviceOption);
this.callback = callback;
this.async_call = (this.callback != null);
}
/**
* CosyVoice Speech Synthesis SDK
*
* @param param Configuration for speech synthesis, including voice type, volume, etc.
* @param callback In non-streaming output scenarios, this can be set to null
*/
public SpeechSynthesizer(
SpeechSynthesisParam param, ResultCallback<SpeechSynthesisResult> callback) {
if (param == null) {
throw new ApiException(
new InputRequiredException("Parameter invalid: SpeechSynthesisParam is null"));
}
this.parameters = param;
serviceOption =
ApiServiceOption.builder()
.protocol(Protocol.WEBSOCKET)
.streamingMode(StreamingMode.DUPLEX)
.outputMode(OutputMode.ACCUMULATE)
.taskGroup(TaskGroup.AUDIO.getValue())
.task(Task.TEXT_TO_SPEECH.getValue())
.function(Function.SPEECH_SYNTHESIZER.getValue())
.build();
duplexApi = new SynchronizeFullDuplexApi<>(serviceOption);
this.callback = callback;
this.async_call = (this.callback != null);
}
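// Usage sketch (illustrative, not part of the original source): constructing a synthesizer.
// The model and voice names below are placeholder assumptions; substitute values enabled for
// your account, and make sure an API key is configured (environment variable or param).
//
//   SpeechSynthesisParam param =
//       SpeechSynthesisParam.builder()
//           .model("cosyvoice-v1")   // assumed model name
//           .voice("longxiaochun")   // assumed voice name
//           .build();
//   SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, null);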
public String getLastRequestId() {
return lastRequestId.get();
}
/**
* Stream input and output speech synthesis using Flowable features
*
* @param textStream The text stream to be synthesized
* @return The output event stream, including real-time audio and timestamps
*/
public Flowable<SpeechSynthesisResult> streamingCallAsFlowable(Flowable<String> textStream)
throws ApiException, NoApiKeyException {
startStreamTimeStamp = System.currentTimeMillis();
recvAudioLength = 0;
return duplexApi
.duplexCall(
StreamInputTtsParamWithStream.fromStreamInputTtsParam(this.parameters, textStream))
.map(SpeechSynthesisResult::fromDashScopeResult)
.filter(item -> !canceled.get())
.doOnNext(
result -> {
if (result.getAudioFrame() != null) {
if (recvAudioLength == 0) {
firstPackageTimeStamp = System.currentTimeMillis();
log.debug(
"[TtsV2] first package delay: "
+ (System.currentTimeMillis() - startStreamTimeStamp)
+ " ms");
}
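// Convert the received bytes to milliseconds of audio, assuming 16-bit (2 bytes per
// sample) mono PCM at the configured sample rate; current_rtf below is wall-clock ms
// spent per ms of audio received (a real-time factor).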
recvAudioLength +=
(double) result.getAudioFrame().capacity()
/ ((double) (2 * parameters.getFormat().getSampleRate()) / 1000);
long current = System.currentTimeMillis();
double current_rtf = (current - firstPackageTimeStamp) / recvAudioLength;
log.debug(
"[TtsV2] Recv Audio Binary: "
+ result.getAudioFrame().capacity()
+ " bytes, total audio "
+ recvAudioLength
+ " ms, current_rtf: "
+ current_rtf);
}
});
}
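// Usage sketch (illustrative, not part of the original source): streaming input and output.
// The two text segments are placeholder inputs; the synthesizer is assumed to be constructed
// as in the sketch above.
//
//   Flowable<String> texts = Flowable.just("Hello, ", "world.");
//   synthesizer
//       .streamingCallAsFlowable(texts)
//       .blockingForEach(
//           result -> {
//             if (result.getAudioFrame() != null) {
//               // consume result.getAudioFrame() (a ByteBuffer of audio) as it arrives
//             }
//           });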
/**
* Stream output speech synthesis using Flowable features (non-streaming input)
*
* @param text Text to be synthesized
* @return The output event stream, including real-time audio and timestamps
*/
public Flowable<SpeechSynthesisResult> callAsFlowable(String text)
throws ApiException, NoApiKeyException {
startStreamTimeStamp = System.currentTimeMillis();
recvAudioLength = 0;
return duplexApi
.duplexCall(
StreamInputTtsParamWithStream.fromStreamInputTtsParam(
this.parameters,
Flowable.create(
emitter -> {
new Thread(
() -> {
emitter.onNext(text);
emitter.onComplete();
})
.start();
},
BackpressureStrategy.BUFFER)))
.map(SpeechSynthesisResult::fromDashScopeResult)
.doOnNext(
result -> {
if (result.getAudioFrame() != null) {
if (recvAudioLength == 0) {
firstPackageTimeStamp = System.currentTimeMillis();
log.debug(
"[TtsV2] first package delay: "
+ (System.currentTimeMillis() - startStreamTimeStamp)
+ " ms");
}
recvAudioLength +=
(double) result.getAudioFrame().capacity()
/ ((double) (2 * parameters.getFormat().getSampleRate()) / 1000);
long current = System.currentTimeMillis();
double current_rtf = (current - firstPackageTimeStamp) / recvAudioLength;
log.debug(
"[TtsV2] Recv Audio Binary: "
+ result.getAudioFrame().capacity()
+ " bytes, total audio "
+ recvAudioLength
+ " ms, current_rtf: "
+ current_rtf);
}
});
}
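// Usage sketch (illustrative, not part of the original source): collecting the streamed
// audio of a single utterance into one buffer.
//
//   ByteArrayOutputStream out = new ByteArrayOutputStream();
//   synthesizer
//       .callAsFlowable("Hello, world.")
//       .blockingForEach(
//           result -> {
//             ByteBuffer frame = result.getAudioFrame();
//             if (frame != null) {
//               byte[] bytes = new byte[frame.remaining()];
//               frame.get(bytes);
//               out.write(bytes, 0, bytes.length);
//             }
//           });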
/**
* Start speech synthesis: establish a connection with the server, send a speech synthesis
* request, and synchronously receive confirmation from the server.
*/
private void startStream() {
startStreamTimeStamp = System.currentTimeMillis();
recvAudioLength = 0;
this.canceled.set(false);
if (this.callback == null) {
throw new ApiException(
new InputRequiredException("Parameter invalid: ResultCallback is null"));
}
// A new session starts; reset all buffers.
lastRequestId.set(null);
outputStream = new ByteArrayOutputStream();
audioData = null;
// timestamps.clear();
WritableByteChannel channel = Channels.newChannel(outputStream);
Flowable<String> textFrames =
Flowable.create(
emitter -> {
synchronized (SpeechSynthesizer.this) {
if (cmdBuffer.size() > 0) {
for (SpeechSynthesizer.AsyncCmdBuffer buffer : cmdBuffer) {
if (buffer.isStop) {
emitter.onComplete();
return;
} else {
emitter.onNext(buffer.text);
}
}
cmdBuffer.clear();
}
log.info("set textEmitter");
textEmitter = emitter;
}
},
BackpressureStrategy.BUFFER);
synchronized (this) {
state = SpeechSynthesisState.TTS_STARTED;
cmdBuffer.clear();
}
stopLatch = new AtomicReference<>(new CountDownLatch(1));
try {
duplexApi.duplexCall(
SpeechSynthesizer.StreamInputTtsParamWithStream.fromStreamInputTtsParam(
this.parameters, textFrames),
new ResultCallback<DashScopeResult>() {
// private Sentence lastSentence = null;
@Override
public void onEvent(DashScopeResult message) {
if (canceled.get()) {
return;
}
SpeechSynthesisResult speechSynthesisResult =
SpeechSynthesisResult.fromDashScopeResult(message);
try {
if (lastRequestId.get() == null) {
lastRequestId.set(speechSynthesisResult.getRequestId());
}
/*
if (speechSynthesisResult.getTimestamp() != null && !async_call) {
Sentence sentence = speechSynthesisResult.getTimestamp();
if (lastSentence == null) {
lastSentence = sentence;
if (sentence.getEndTime() != 0) {
timestamps.add(sentence);
}
} else {
if (!lastSentence.equals(sentence) && sentence.getEndTime() != 0) {
lastSentence = sentence;
timestamps.add(sentence);
}
}
}
*/
if (speechSynthesisResult.getAudioFrame() != null) {
if (recvAudioLength == 0) {
firstPackageTimeStamp = System.currentTimeMillis();
log.debug(
"[TtsV2] first package delay: "
+ (System.currentTimeMillis() - startStreamTimeStamp)
+ " ms");
}
recvAudioLength +=
(double) speechSynthesisResult.getAudioFrame().capacity()
/ ((double) (2 * parameters.getFormat().getSampleRate()) / 1000);
long current = System.currentTimeMillis();
double current_rtf = (current - firstPackageTimeStamp) / recvAudioLength;
log.debug(
"[TtsV2] Recv Audio Binary: "
+ speechSynthesisResult.getAudioFrame().capacity()
+ " bytes, total audio "
+ recvAudioLength
+ " ms, current_rtf: "
+ current_rtf);
if (!async_call) {
try {
channel.write(speechSynthesisResult.getAudioFrame());
} catch (IOException e) {
log.error(
"Failed to write audio: {}", speechSynthesisResult.getAudioFrame(), e);
}
}
}
} catch (Exception e) {
log.error("Failed to parse response: {}", message, e);
callback.onError(e);
}
callback.onEvent(speechSynthesisResult);
}
@Override
public void onComplete() {
log.debug("[TtsV2] onComplete");
if (canceled.get()) {
return;
}
synchronized (SpeechSynthesizer.this) {
state = SpeechSynthesisState.IDLE;
}
audioData = ByteBuffer.wrap(outputStream.toByteArray());
try {
outputStream.close();
} catch (IOException e) {
log.error("Failed to close channel: {}", e);
}
callback.onComplete();
if (stopLatch.get() != null) {
stopLatch.get().countDown();
}
}
@Override
public void onError(Exception e) {
if (canceled.get()) {
return;
}
synchronized (SpeechSynthesizer.this) {
state = SpeechSynthesisState.IDLE;
}
ApiException apiException = new ApiException(e);
apiException.setStackTrace(e.getStackTrace());
callback.onError(apiException);
if (stopLatch.get() != null) {
stopLatch.get().countDown();
}
}
});
} catch (NoApiKeyException e) {
ApiException apiException = new ApiException(e);
apiException.setStackTrace(e.getStackTrace());
callback.onError(apiException);
if (stopLatch.get() != null) {
stopLatch.get().countDown();
}
}
}
/**
* Send text in a streaming manner
*
* @param text UTF-8 encoded text
*/
private void submitText(String text) {
if (text == "") {
throw new ApiException(new InputRequiredException("Parameter invalid: text is null"));
}
synchronized (this) {
if (state != SpeechSynthesisState.TTS_STARTED) {
throw new ApiException(
new InputRequiredException(
"State invalid: expect stream input tts state is started but " + state.getValue()));
}
if (textEmitter == null) {
log.debug("submitText to new emitter: " + text);
cmdBuffer.add(AsyncCmdBuffer.builder().text(text).build());
} else {
log.debug("submitText to emitter: " + text);
textEmitter.onNext(text);
}
}
}
/**
* Synchronously stop the streaming input speech synthesis task. Wait for all remaining
* synthesized audio before returning
*
* @param completeTimeoutMillis The timeout for the wait. If set to a value greater than zero, it
* waits up to that many milliseconds; otherwise, it waits indefinitely. A RuntimeException is
* thrown on timeout.
*/
public void streamingComplete(long completeTimeoutMillis) throws RuntimeException {
log.info("streamingComplete with timeout: " + completeTimeoutMillis);
synchronized (this) {
if (state != SpeechSynthesisState.TTS_STARTED) {
throw new ApiException(
new RuntimeException(
"State invalid: expect stream input tts state is started but " + state.getValue()));
}
if (textEmitter == null) {
log.info("adding stop to new emitter");
cmdBuffer.add(AsyncCmdBuffer.builder().isStop(true).build());
} else {
log.info("adding stop to emitter");
textEmitter.onComplete();
}
}
if (stopLatch.get() != null) {
try {
if (completeTimeoutMillis > 0) {
log.info("start waiting for stopLatch");
if (!stopLatch.get().await(completeTimeoutMillis, TimeUnit.MILLISECONDS)) {
throw new RuntimeException("TimeoutError: waiting for streaming complete");
}
} else {
log.info("start waiting for stopLatch");
stopLatch.get().await();
}
log.info("stopLatch is done");
} catch (InterruptedException e) {
// Restore the interrupt flag so callers can observe the interruption.
Thread.currentThread().interrupt();
log.error("Interrupted while waiting for streaming complete", e);
}
}
}
/**
* Synchronously stop the streaming input speech synthesis task. Wait for all remaining
* synthesized audio before returning. If synthesis does not complete within 600 seconds, a
* RuntimeException is thrown on timeout.
*/
public void streamingComplete() throws RuntimeException {
streamingComplete(600000);
}
/**
* Asynchronously stop the streaming input speech synthesis task; returns immediately. You must
* listen for the synthesis-complete notification in the ResultCallback's onComplete method. Do
* not destroy the object or the callback before completion.
*/
public void asyncStreamingComplete() {
synchronized (this) {
if (state != SpeechSynthesisState.TTS_STARTED) {
throw new ApiException(
new RuntimeException(
"State invalid: expect stream input tts state is started but " + state.getValue()));
}
if (textEmitter == null) {
cmdBuffer.add(AsyncCmdBuffer.builder().isStop(true).build());
} else {
textEmitter.onComplete();
}
}
}
/**
* Immediately terminate the streaming input speech synthesis task and discard any remaining audio
* that is not yet delivered.
*/
public void streamingCancel() {
canceled.set(true);
synchronized (this) {
if (state != SpeechSynthesisState.TTS_STARTED) {
return;
}
if (textEmitter == null) {
cmdBuffer.add(AsyncCmdBuffer.builder().isStop(true).build());
} else {
textEmitter.onComplete();
}
}
}
/**
* Streaming input mode: You can call streamingCall multiple times to send text. A session is
* created on the first call, and it ends after streamingComplete is called.
*
* @param text UTF-8 encoded text
*/
public void streamingCall(String text) {
if (isFirst) {
isFirst = false;
this.startStream();
}
this.submitText(text);
}
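// Usage sketch (illustrative, not part of the original source): sending several text
// segments on one session, then waiting for the remaining audio. The callback shown is a
// placeholder implementation.
//
//   synthesizer.updateParamAndCallback(
//       param,
//       new ResultCallback<SpeechSynthesisResult>() {
//         @Override
//         public void onEvent(SpeechSynthesisResult r) {
//           // handle r.getAudioFrame() as it arrives
//         }
//
//         @Override
//         public void onComplete() {}
//
//         @Override
//         public void onError(Exception e) {}
//       });
//   for (String segment : new String[] {"Hello, ", "world."}) {
//     synthesizer.streamingCall(segment);
//   }
//   synthesizer.streamingComplete();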
/**
* Speech synthesis. If a callback is set, the audio is returned in real time through the
* onEvent callback; otherwise, this function blocks until all audio is received and then returns
* the complete audio data.
*
* @param text UTF-8 encoded text
* @param timeoutMillis Timeout for waiting for audio data. If set to a value greater than zero,
* it waits up to that many milliseconds; otherwise, it waits indefinitely.
* @return If a callback is not set during initialization, the complete audio is returned as the
* function's return value. Otherwise, the return value is null.
*/
public ByteBuffer call(String text, long timeoutMillis) throws RuntimeException {
if (this.callback == null) {
this.callback =
new ResultCallback<SpeechSynthesisResult>() {
@Override
public void onEvent(SpeechSynthesisResult message) {}
@Override
public void onComplete() {}
@Override
public void onError(Exception e) {}
};
}
this.startStream();
this.submitText(text);
if (this.async_call) {
this.asyncStreamingComplete();
return null;
} else {
this.streamingComplete(timeoutMillis);
return audioData;
}
}
/**
* Speech synthesis. If a callback is set, the audio is returned in real time through the
* onEvent callback; otherwise, this function blocks until all audio is received and then returns
* the complete audio data.
*
* @param text UTF-8 encoded text
* @return If a callback is not set during initialization, the complete audio is returned as the
* function's return value. Otherwise, the return value is null.
*/
public ByteBuffer call(String text) throws RuntimeException {
return call(text, 0);
}
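// Usage sketch (illustrative, not part of the original source): blocking synthesis without a
// callback; the complete audio is returned once synthesis finishes. The output path and the
// 60-second timeout are placeholders.
//
//   ByteBuffer audio = synthesizer.call("Hello, world.", 60000);
//   try (java.io.FileOutputStream fos = new java.io.FileOutputStream("output.wav")) {
//     fos.getChannel().write(audio);
//   }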
@SuperBuilder
private static class AsyncCmdBuffer {
@Builder.Default private boolean isStop = false;
private String text;
}
@SuperBuilder
private static class StreamInputTtsParamWithStream extends SpeechSynthesisParam {
@NonNull private Flowable<String> textStream;
public static StreamInputTtsParamWithStream fromStreamInputTtsParam(
SpeechSynthesisParam param, Flowable<String> textStream) {
return StreamInputTtsParamWithStream.builder()
.headers(param.getHeaders())
.parameters(param.getParameters())
.format(param.getFormat())
.textStream(textStream)
.model(param.getModel())
.voice(param.getVoice())
.apiKey(param.getApiKey())
.build();
}
@Override
public Flowable