
// edu.cmu.sphinx.demo.speakerid.SpeakerIdentificationDemo — CMU Sphinx speaker identification demo.
package edu.cmu.sphinx.demo.speakerid;
import java.net.URL;
import java.util.ArrayList;
import edu.cmu.sphinx.api.Configuration;
import edu.cmu.sphinx.api.SpeechResult;
import edu.cmu.sphinx.api.StreamSpeechRecognizer;
import edu.cmu.sphinx.decoder.adaptation.Stats;
import edu.cmu.sphinx.decoder.adaptation.Transform;
import edu.cmu.sphinx.speakerid.Segment;
import edu.cmu.sphinx.speakerid.SpeakerCluster;
import edu.cmu.sphinx.speakerid.SpeakerIdentification;
import edu.cmu.sphinx.util.TimeFrame;
public class SpeakerIdentificationDemo {

    /**
     * Formats a duration given in milliseconds as {@code m:ss}.
     *
     * <p>Rounds to the nearest whole second before splitting into minutes
     * and seconds, and zero-pads the seconds field. The previous
     * round-per-field logic could emit "1:60" (e.g. for 119,500 ms) and
     * unpadded seconds such as "1:5".
     *
     * @param milliseconds time in milliseconds
     * @return time in format m:ss
     */
    public static String time(int milliseconds) {
        long totalSeconds = Math.round(milliseconds / 1000.0);
        return String.format("%d:%02d", totalSeconds / 60, totalSeconds % 60);
    }

    /**
     * Prints one line per speech segment: the file name, the segment start
     * time, the segment duration and a 1-based speaker index.
     *
     * @param speakers clusters whose speaker intervals should be printed
     * @param fileName the name of the file being processed
     */
    public static void printSpeakerIntervals(
            ArrayList<SpeakerCluster> speakers, String fileName) {
        int idx = 0;
        for (SpeakerCluster spk : speakers) {
            idx++;
            for (Segment seg : spk.getSpeakerIntervals()) {
                System.out.println(fileName + " " + time(seg.getStartTime())
                        + " " + time(seg.getLength()) + " Speaker" + idx);
            }
        }
    }

    /**
     * Decodes the audio twice for each speaker cluster: a first pass
     * recognizes the speaker's segments and accumulates adaptation
     * statistics, then a second pass decodes the same segments with the
     * resulting transform applied and prints each hypothesis.
     *
     * @param speakers clusters describing each speaker's time intervals
     * @param url location of the audio to decode
     * @throws Exception if recognition or stream access fails
     */
    public static void speakerAdaptiveDecoding(
            ArrayList<SpeakerCluster> speakers, URL url) throws Exception {
        Configuration configuration = new Configuration();
        // Models are loaded from the sphinx4-data jar on the classpath.
        configuration.setAcousticModelPath(
                "resource:/edu/cmu/sphinx/models/en-us/en-us");
        configuration.setDictionaryPath(
                "resource:/edu/cmu/sphinx/models/en-us/cmudict-en-us.dict");
        configuration.setLanguageModelPath(
                "resource:/edu/cmu/sphinx/models/en-us/en-us.lm.bin");

        StreamSpeechRecognizer recognizer =
                new StreamSpeechRecognizer(configuration);
        SpeechResult result;
        for (SpeakerCluster spk : speakers) {
            Stats stats = recognizer.createStats(1);
            ArrayList<Segment> segments = spk.getSpeakerIntervals();

            // Pass 1: recognize each segment and collect adaptation stats.
            for (Segment s : segments) {
                long startTime = s.getStartTime();
                long endTime = startTime + s.getLength();
                recognizer.startRecognition(
                        url.openStream(), new TimeFrame(startTime, endTime));
                while ((result = recognizer.getResult()) != null) {
                    stats.collect(result);
                }
                recognizer.stopRecognition();
            }

            // Build the speaker-specific transform and apply it to the
            // recognizer before the second pass.
            Transform profile = stats.createTransform();
            recognizer.setTransform(profile);

            // Pass 2: decode the same segments with the adapted model.
            for (Segment seg : segments) {
                long startTime = seg.getStartTime();
                long endTime = startTime + seg.getLength();
                recognizer.startRecognition(
                        url.openStream(), new TimeFrame(startTime, endTime));
                while ((result = recognizer.getResult()) != null) {
                    System.out.format("Hypothesis: %s\n",
                            result.getHypothesis());
                }
                recognizer.stopRecognition();
            }
        }
    }

    /**
     * Runs diarization on the bundled {@code test.wav} resource, prints the
     * per-speaker intervals and then performs speaker-adaptive decoding.
     *
     * @param args unused
     * @throws Exception if the audio cannot be read or decoding fails
     */
    public static void main(String[] args) throws Exception {
        SpeakerIdentification sd = new SpeakerIdentification();
        URL url = SpeakerIdentificationDemo.class.getResource("test.wav");
        ArrayList<SpeakerCluster> clusters = sd.cluster(url.openStream());
        printSpeakerIntervals(clusters, url.getPath());
        speakerAdaptiveDecoding(clusters, url);
    }
}
// © 2015 - 2025 Weber Informatics LLC