com.baidu.aip.speech.PromptSoundSpeech Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of java-sdk Show documentation
The AIP SDK for Java provides Java APIs for all of AI APIs.
There is a newer version: 4.8.0
package com.baidu.aip.speech;

import org.bytedeco.ffmpeg.global.avcodec;
import org.bytedeco.javacv.FFmpegFrameGrabber;
import org.bytedeco.javacv.FFmpegFrameGrabber.Exception;
import org.bytedeco.javacv.FFmpegFrameRecorder;
import org.bytedeco.javacv.Frame;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * 可以自动在生成的音频前插入音效，可以通过配置文件自定义音效的种类和位置
 */
public class PromptSoundSpeech extends AipSpeech {

    private Map audioConfigs;

    /**
     * 初始化YmmSoundSpeech对象
     *
     * @param appId      应用程序ID
     * @param apiKey     应用程序秘钥
     * @param secretKey  应用程序秘钥
     * @param configPath 合成音效的配置路径
     */
    public PromptSoundSpeech(String appId, String apiKey, String secretKey, String configPath) {
        super(appId, apiKey, secretKey);

        // 初始化配置文件
        initializeConfig(configPath);
    }

    /**
     * 初始化配置
     *
     * @param configPath 配置文件的路径
     * @throws IllegalArgumentException 如果配置文件不存在、配置条目缺少必要字段、音频文件不存在或音频文件为空，将抛出此异常
     * @throws RuntimeException         如果在读取文件或解析JSON时发生异常，将抛出此异常
     */
    private void initializeConfig(String configPath) throws IllegalArgumentException, RuntimeException {

        try {
            String jsonContent;
            Map newConfig = new HashMap<>();
            if (isJsonArray(configPath)) {
                // 如果传入的是JSON字符串数组，则直接使用它
                jsonContent = configPath;
            } else if (configPath.startsWith("http://") || configPath.startsWith("https://")) {
                jsonContent = new String(downloadFromUrl(configPath));
            } else {
                File configFile = new File(configPath);
                if (!configFile.exists()) {
                    throw new IllegalArgumentException("config file not exist: " + configPath);
                }
                jsonContent = new String(Files.readAllBytes(configFile.toPath()), StandardCharsets.UTF_8);
            }

            JSONArray jsonArray = new JSONArray(jsonContent);
            //Map configMap = new HashMap<>();

            for (int i = 0; i < jsonArray.length(); i++) {
                JSONObject obj = jsonArray.getJSONObject(i);
                if (!obj.has("audioType") || !obj.has("audioPath")) {
                    throw new IllegalArgumentException("config file item not exist");
                }

                int audioType = obj.getInt("audioType");
                String audioPath = obj.getString("audioPath");
                byte[] audioData = loadAudioData(audioPath);
                // 获取音频格式（后缀）
                String audioFormat = getAudioFormat(audioPath);
                newConfig.put(audioType, new AudioConfig(audioType, audioPath, audioData, audioFormat));
            }
            // 成功解析后，清理旧配置并设置新配置
            if (this.audioConfigs != null) {
                audioConfigs.values().forEach(AudioConfig::clearAudioData);
            }

            this.audioConfigs = newConfig;
        } catch (IOException | JSONException e) {
            throw new RuntimeException("init config failed: " + e.getMessage(), e);
        }
    }

    /**
     * 根据文本、语言、音频类型和选项进行语音合成。
     * 参考文档：https://ai.baidu.com/ai-doc/SPEECH/Jlbxdezuf
     *
     * @return 返回合成后的 TtsResponse 列表。
     */
    public List synthesis(
            String text, String lang, int ctp, List audioTypes, HashMap options) {
        TtsResponse rawSynthesisResult = super.synthesis(text, lang, ctp, options);
//         TtsResponse rawSynthesisResult = mock();
        // 未成功合成音频数据
        if (rawSynthesisResult == null || rawSynthesisResult.getData() == null) {
            return Collections.singletonList(rawSynthesisResult);
        }

        // 拼接音频
        try {
            return mergeAudios(rawSynthesisResult, audioTypes);
        } catch (FFmpegFrameRecorder.Exception e) {
            // 如果拼接音频出现错误，只返回原始合成的音频
            LOGGER.error("merge prompt sound error occurred, skip merge: ", e);
            return Collections.singletonList(rawSynthesisResult);
        }
    }

    private byte[] loadAudioData(String path) throws IOException {
        if (path.startsWith("http://") || path.startsWith("https://")) {
            return downloadFromUrl(path);
        } else {
            File audioFile = new File(path);
            if (!audioFile.exists()) {
                throw new IllegalArgumentException("audio file not exist: " + path);
            }
            return Files.readAllBytes(audioFile.toPath());
        }
    }

    // 检查传入的字符串是否为有效的JSON
    private boolean isJsonArray(String str) {
        if (str == null || str.isEmpty()) {
            return false;
        }
        try {
            new JSONArray(str);
            return true;
        } catch (JSONException e) {
            return false;
        }
    }

    private byte[] downloadFromUrl(String urlString) throws IOException {
        URL url = new URL(urlString);
        try (InputStream is = url.openStream()) {
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            byte[] data = new byte[8192];
            int bytesRead;
            while ((bytesRead = is.read(data, 0, data.length)) != -1) {
                buffer.write(data, 0, bytesRead);
            }
            return buffer.toByteArray();
        }
    }

    /**
     * 音频配置类
     */
    public static class AudioConfig {
        private int audioType;
        private String audioPath;
        private byte[] audioData;
        private String audioFormat; // 音频格式

        public AudioConfig() {
        }

        public AudioConfig(int audioType, String audioPath, byte[] audioData, String audioFormat) {
            this.audioType = audioType;
            this.audioPath = audioPath;
            this.audioData = audioData;
            this.audioFormat = audioFormat;
        }

        public int getAudioType() {
            return audioType;
        }

        public String getAudioPath() {
            return audioPath;
        }

        public byte[] getAudioData() {
            return audioData;
        }

        public String getAudioFormat() {
            return audioFormat;
        }

        // 释放音频数据
        public void clearAudioData() {
            this.audioData = null;
        }
    }

    /**
     * 从音频路径中提取音频格式（后缀）
     *
     * @param audioPath 音频路径
     * @return 音频格式（如"mp3"、"wav"等），默认为"mp3"
     */
    private String getAudioFormat(String audioPath) {
        if (audioPath == null || audioPath.isEmpty()) {
            return "mp3"; // 默认格式
        }

        // 处理URL中的查询参数
        String path = audioPath;
        int queryIndex = path.indexOf('?');
        if (queryIndex != -1) {
            path = path.substring(0, queryIndex);
        }

        // 获取文件扩展名
        int dotIndex = path.lastIndexOf('.');
        if (dotIndex != -1 && dotIndex < path.length() - 1) {
            return path.substring(dotIndex + 1).toLowerCase();
        }

        return "mp3"; // 默认格式
    }

    /**
     * 将多个音频合并到原始响应中
     *
     * @param originalResponse 原始响应对象，包含原始音频数据
     * @param audioTypes       需要合并的音频类型列表
     * @return 包含合并后音频数据的响应列表
     */
    private List mergeAudios(TtsResponse originalResponse, List audioTypes)
            throws FFmpegFrameRecorder.Exception {
        List mergedResponses = new ArrayList<>();

        for (int audioType : audioTypes) {
            byte[] resultAudio = originalResponse.getData();
            if (audioConfigs.containsKey(audioType)) {
                resultAudio =
                        doMergeAudio(
                                originalResponse.getData(),
                                audioConfigs.get(audioType).getAudioData(),
                                audioConfigs.get(audioType).getAudioFormat());
            }

            TtsResponse mergedResponse = new TtsResponse();
            mergedResponse.setData(resultAudio);
            mergedResponses.add(mergedResponse);
        }

        return mergedResponses;
    }

    /**
     * 合并音频文件
     *
     * @param originAudio 原始音频数据
     * @param soundAudio  音效音频数据
     * @param soundFormat 音效音频格式
     * @return 合并后的音频数据
     */
    private byte[] doMergeAudio(byte[] originAudio, byte[] soundAudio, String soundFormat)
            throws FFmpegFrameRecorder.Exception {
        FFmpegFrameGrabber originGrabber = null;
        FFmpegFrameGrabber soundGrabber = null;
        FFmpegFrameRecorder recorder = null;
        ByteArrayOutputStream outputStream = null;
        try {
            // 原始音频
            originGrabber = new FFmpegFrameGrabber(new ByteArrayInputStream(originAudio));
            originGrabber.start();

            // 音效
            soundGrabber = new FFmpegFrameGrabber(new ByteArrayInputStream(soundAudio));
            soundGrabber.start();

            // 获取音频参数
            int channels = Math.max(originGrabber.getAudioChannels(), soundGrabber.getAudioChannels());
            int sampleRate = Math.max(originGrabber.getSampleRate(), soundGrabber.getSampleRate());

            // 合成后的音频
            outputStream = new ByteArrayOutputStream();
            recorder = new FFmpegFrameRecorder(outputStream, 2);
            // 设置编码器
            switch (soundFormat.toLowerCase()) {
                case "mp3":
                    recorder.setAudioCodec(avcodec.AV_CODEC_ID_MP3);
                    break;
                case "wav":
                    recorder.setAudioCodec(avcodec.AV_CODEC_ID_PCM_S16LE);
                    break;
                case "aac":
                    recorder.setAudioCodec(avcodec.AV_CODEC_ID_AAC);
                    break;
                case "flac":
                    recorder.setAudioCodec(avcodec.AV_CODEC_ID_FLAC);
                    break;
                // 可添加更多格式支持
                default:
                    recorder.setAudioCodec(avcodec.AV_CODEC_ID_MP3); // 默认使用MP3
                    break;
            }
            recorder.setFormat(soundFormat.toLowerCase());
            recorder.setAudioChannels(channels);
            recorder.setSampleRate(sampleRate);
            recorder.setAudioBitrate(64000);
            recorder.start();

            // 合成音频
            // 先拼接音效
            Frame frame;
            while ((frame = soundGrabber.grabSamples()) != null) {
                if (frame.samples != null) {
                    recorder.recordSamples(frame.sampleRate, frame.audioChannels, frame.samples);
                }
            }

            // 再拼接原始音频
            while ((frame = originGrabber.grabSamples()) != null) {
                if (frame.samples != null) {
                    recorder.recordSamples(frame.sampleRate, frame.audioChannels, frame.samples);
                }
            }
            return outputStream.toByteArray();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            // 释放资源
            closeResource(recorder);
            closeResource(originGrabber);
            closeResource(soundGrabber);
            if (outputStream != null) {
                try {
                    outputStream.close();
                } catch (IOException ignore) {
                }
            }
        }
    }

    // 统一资源关闭方法
    private void closeResource(AutoCloseable resource) {
        if (resource != null) {
            try {
                if (resource instanceof FFmpegFrameGrabber) {
                    FFmpegFrameGrabber grabber = (FFmpegFrameGrabber) resource;
                    try {
                        grabber.stop();  // 先停止
                    } catch (java.lang.Exception e) {
                        // 忽略 stop 异常
                    }
                    grabber.close();     // 再关闭
                    grabber.release();   // 强制释放 native 资源
                } else if (resource instanceof FFmpegFrameRecorder) {
                    FFmpegFrameRecorder recorder = (FFmpegFrameRecorder) resource;
                    try {
                        recorder.stop();
                    } catch (java.lang.Exception e) {
                        // 忽略 stop 异常
                    }
                    recorder.close();
                    recorder.release(); // 强制释放 native 资源
                } else {
                    resource.close();
                }
            } catch (java.lang.Exception e) {
                LOGGER.warn("Error closing resource: {}", e.getMessage());
            }
        }
    }
}