
com.baidu.aip.speech.PromptSoundSpeech Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of java-sdk Show documentation
Show all versions of java-sdk Show documentation
The AIP SDK for Java provides Java APIs for all of the AI APIs.
package com.baidu.aip.speech;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.bytedeco.ffmpeg.global.avcodec;
import org.bytedeco.javacv.FFmpegFrameGrabber;
import org.bytedeco.javacv.FFmpegFrameGrabber.Exception;
import org.bytedeco.javacv.FFmpegFrameRecorder;
import org.bytedeco.javacv.Frame;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
/**
* 可以自动在生成的音频前插入音效,可以通过配置文件自定义音效的种类和位置
*/
public class PromptSoundSpeech extends AipSpeech {
private Map audioConfigs;
/**
* 初始化YmmSoundSpeech对象
*
* @param appId 应用程序ID
* @param apiKey 应用程序秘钥
* @param secretKey 应用程序秘钥
* @param configPath 合成音效的配置路径
*/
public PromptSoundSpeech(String appId, String apiKey, String secretKey, String configPath) {
super(appId, apiKey, secretKey);
// 初始化配置文件
initializeConfig(configPath);
}
/**
* 初始化配置
*
* @param configPath 配置文件的路径
* @throws IllegalArgumentException 如果配置文件不存在、配置条目缺少必要字段、音频文件不存在或音频文件为空,将抛出此异常
* @throws RuntimeException 如果在读取文件或解析JSON时发生异常,将抛出此异常
*/
private void initializeConfig(String configPath) throws IllegalArgumentException, RuntimeException {
try {
String jsonContent;
Map newConfig = new HashMap<>();
if (isJsonArray(configPath)) {
// 如果传入的是JSON字符串数组,则直接使用它
jsonContent = configPath;
} else if (configPath.startsWith("http://") || configPath.startsWith("https://")) {
jsonContent = new String(downloadFromUrl(configPath));
} else {
File configFile = new File(configPath);
if (!configFile.exists()) {
throw new IllegalArgumentException("config file not exist: " + configPath);
}
jsonContent = new String(Files.readAllBytes(configFile.toPath()), StandardCharsets.UTF_8);
}
JSONArray jsonArray = new JSONArray(jsonContent);
//Map configMap = new HashMap<>();
for (int i = 0; i < jsonArray.length(); i++) {
JSONObject obj = jsonArray.getJSONObject(i);
if (!obj.has("audioType") || !obj.has("audioPath")) {
throw new IllegalArgumentException("config file item not exist");
}
int audioType = obj.getInt("audioType");
String audioPath = obj.getString("audioPath");
byte[] audioData = loadAudioData(audioPath);
// 获取音频格式(后缀)
String audioFormat = getAudioFormat(audioPath);
newConfig.put(audioType, new AudioConfig(audioType, audioPath, audioData, audioFormat));
}
// 成功解析后,清理旧配置并设置新配置
if (this.audioConfigs != null) {
audioConfigs.values().forEach(AudioConfig::clearAudioData);
}
this.audioConfigs = newConfig;
} catch (IOException | JSONException e) {
throw new RuntimeException("init config failed: " + e.getMessage(), e);
}
}
/**
* 根据文本、语言、音频类型和选项进行语音合成。
* 参考文档:https://ai.baidu.com/ai-doc/SPEECH/Jlbxdezuf
*
* @return 返回合成后的 TtsResponse 列表。
*/
public List synthesis(
String text, String lang, int ctp, List audioTypes, HashMap options) {
TtsResponse rawSynthesisResult = super.synthesis(text, lang, ctp, options);
// TtsResponse rawSynthesisResult = mock();
// 未成功合成音频数据
if (rawSynthesisResult == null || rawSynthesisResult.getData() == null) {
return Collections.singletonList(rawSynthesisResult);
}
// 拼接音频
try {
return mergeAudios(rawSynthesisResult, audioTypes);
} catch (FFmpegFrameRecorder.Exception e) {
// 如果拼接音频出现错误,只返回原始合成的音频
LOGGER.error("merge prompt sound error occurred, skip merge: ", e);
return Collections.singletonList(rawSynthesisResult);
}
}
private byte[] loadAudioData(String path) throws IOException {
if (path.startsWith("http://") || path.startsWith("https://")) {
return downloadFromUrl(path);
} else {
File audioFile = new File(path);
if (!audioFile.exists()) {
throw new IllegalArgumentException("audio file not exist: " + path);
}
return Files.readAllBytes(audioFile.toPath());
}
}
// 检查传入的字符串是否为有效的JSON
private boolean isJsonArray(String str) {
if (str == null || str.isEmpty()) {
return false;
}
try {
new JSONArray(str);
return true;
} catch (JSONException e) {
return false;
}
}
private byte[] downloadFromUrl(String urlString) throws IOException {
URL url = new URL(urlString);
try (InputStream is = url.openStream()) {
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
byte[] data = new byte[8192];
int bytesRead;
while ((bytesRead = is.read(data, 0, data.length)) != -1) {
buffer.write(data, 0, bytesRead);
}
return buffer.toByteArray();
}
}
/**
* 音频配置类
*/
public static class AudioConfig {
private int audioType;
private String audioPath;
private byte[] audioData;
private String audioFormat; // 音频格式
public AudioConfig() {
}
public AudioConfig(int audioType, String audioPath, byte[] audioData, String audioFormat) {
this.audioType = audioType;
this.audioPath = audioPath;
this.audioData = audioData;
this.audioFormat = audioFormat;
}
public int getAudioType() {
return audioType;
}
public String getAudioPath() {
return audioPath;
}
public byte[] getAudioData() {
return audioData;
}
public String getAudioFormat() {
return audioFormat;
}
// 释放音频数据
public void clearAudioData() {
this.audioData = null;
}
}
/**
* 从音频路径中提取音频格式(后缀)
*
* @param audioPath 音频路径
* @return 音频格式(如"mp3"、"wav"等),默认为"mp3"
*/
private String getAudioFormat(String audioPath) {
if (audioPath == null || audioPath.isEmpty()) {
return "mp3"; // 默认格式
}
// 处理URL中的查询参数
String path = audioPath;
int queryIndex = path.indexOf('?');
if (queryIndex != -1) {
path = path.substring(0, queryIndex);
}
// 获取文件扩展名
int dotIndex = path.lastIndexOf('.');
if (dotIndex != -1 && dotIndex < path.length() - 1) {
return path.substring(dotIndex + 1).toLowerCase();
}
return "mp3"; // 默认格式
}
/**
* 将多个音频合并到原始响应中
*
* @param originalResponse 原始响应对象,包含原始音频数据
* @param audioTypes 需要合并的音频类型列表
* @return 包含合并后音频数据的响应列表
*/
private List mergeAudios(TtsResponse originalResponse, List audioTypes)
throws FFmpegFrameRecorder.Exception {
List mergedResponses = new ArrayList<>();
for (int audioType : audioTypes) {
byte[] resultAudio = originalResponse.getData();
if (audioConfigs.containsKey(audioType)) {
resultAudio =
doMergeAudio(
originalResponse.getData(),
audioConfigs.get(audioType).getAudioData(),
audioConfigs.get(audioType).getAudioFormat());
}
TtsResponse mergedResponse = new TtsResponse();
mergedResponse.setData(resultAudio);
mergedResponses.add(mergedResponse);
}
return mergedResponses;
}
/**
* 合并音频文件
*
* @param originAudio 原始音频数据
* @param soundAudio 音效音频数据
* @param soundFormat 音效音频格式
* @return 合并后的音频数据
*/
private byte[] doMergeAudio(byte[] originAudio, byte[] soundAudio, String soundFormat)
throws FFmpegFrameRecorder.Exception {
FFmpegFrameGrabber originGrabber = null;
FFmpegFrameGrabber soundGrabber = null;
FFmpegFrameRecorder recorder = null;
ByteArrayOutputStream outputStream = null;
try {
// 原始音频
originGrabber = new FFmpegFrameGrabber(new ByteArrayInputStream(originAudio));
originGrabber.start();
// 音效
soundGrabber = new FFmpegFrameGrabber(new ByteArrayInputStream(soundAudio));
soundGrabber.start();
// 获取音频参数
int channels = Math.max(originGrabber.getAudioChannels(), soundGrabber.getAudioChannels());
int sampleRate = Math.max(originGrabber.getSampleRate(), soundGrabber.getSampleRate());
// 合成后的音频
outputStream = new ByteArrayOutputStream();
recorder = new FFmpegFrameRecorder(outputStream, 2);
// 设置编码器
switch (soundFormat.toLowerCase()) {
case "mp3":
recorder.setAudioCodec(avcodec.AV_CODEC_ID_MP3);
break;
case "wav":
recorder.setAudioCodec(avcodec.AV_CODEC_ID_PCM_S16LE);
break;
case "aac":
recorder.setAudioCodec(avcodec.AV_CODEC_ID_AAC);
break;
case "flac":
recorder.setAudioCodec(avcodec.AV_CODEC_ID_FLAC);
break;
// 可添加更多格式支持
default:
recorder.setAudioCodec(avcodec.AV_CODEC_ID_MP3); // 默认使用MP3
break;
}
recorder.setFormat(soundFormat.toLowerCase());
recorder.setAudioChannels(channels);
recorder.setSampleRate(sampleRate);
recorder.setAudioBitrate(64000);
recorder.start();
// 合成音频
// 先拼接音效
Frame frame;
while ((frame = soundGrabber.grabSamples()) != null) {
if (frame.samples != null) {
recorder.recordSamples(frame.sampleRate, frame.audioChannels, frame.samples);
}
}
// 再拼接原始音频
while ((frame = originGrabber.grabSamples()) != null) {
if (frame.samples != null) {
recorder.recordSamples(frame.sampleRate, frame.audioChannels, frame.samples);
}
}
return outputStream.toByteArray();
} catch (Exception e) {
e.printStackTrace();
return null;
} finally {
// 释放资源
closeResource(recorder);
closeResource(originGrabber);
closeResource(soundGrabber);
if (outputStream != null) {
try {
outputStream.close();
} catch (IOException ignore) {
}
}
}
}
// 统一资源关闭方法
private void closeResource(AutoCloseable resource) {
if (resource != null) {
try {
if (resource instanceof FFmpegFrameGrabber) {
FFmpegFrameGrabber grabber = (FFmpegFrameGrabber) resource;
try {
grabber.stop(); // 先停止
} catch (java.lang.Exception e) {
// 忽略 stop 异常
}
grabber.close(); // 再关闭
grabber.release(); // 强制释放 native 资源
} else if (resource instanceof FFmpegFrameRecorder) {
FFmpegFrameRecorder recorder = (FFmpegFrameRecorder) resource;
try {
recorder.stop();
} catch (java.lang.Exception e) {
// 忽略 stop 异常
}
recorder.close();
recorder.release(); // 强制释放 native 资源
} else {
resource.close();
}
} catch (java.lang.Exception e) {
LOGGER.warn("Error closing resource: {}", e.getMessage());
}
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy