All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jitsi.utils.dsi.DominantSpeakerIdentification Maven / Gradle / Ivy

There is a newer version: 1.0-133-g6af1020
Show newest version
/*
 * Copyright @ 2015 Atlassian Pty Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jitsi.utils.dsi;

import java.lang.ref.*;
import java.time.*;
import java.util.*;
import java.util.concurrent.*;

import org.jetbrains.annotations.*;
import org.jitsi.utils.concurrent.*;
import org.jitsi.utils.logging2.*;
import org.json.simple.*;

/**
 * Implements {@link ActiveSpeakerDetector} with inspiration from the paper
 * "Dominant Speaker Identification for Multipoint Videoconferencing"
 * by Ilana Volfin and Israel Cohen.
 *
 * @param <T> The type used for speaker identifiers.
 *
 * @author Lyubomir Marinov
 */
@SuppressWarnings("unused")
public class DominantSpeakerIdentification
    extends AbstractActiveSpeakerDetector
{
    /**
     * The threshold of the relevant speech activities in the immediate
     * time-interval in "global decision"/"Dominant speaker
     * selection" phase of the algorithm.
     */
    private static final double C1 = 3;

    /**
     * The threshold of the relevant speech activities in the medium
     * time-interval in "global decision"/"Dominant speaker
     * selection" phase of the algorithm.
     */
    private static final double C2 = 2;

    /**
     * The threshold of the relevant speech activities in the long
     * time-interval in "global decision"/"Dominant speaker
     * selection" phase of the algorithm.
     */
    private static final double C3 = 0;

    /**
     * The indicator which determines whether the
     * DominantSpeakerIdentification class and its instances are to
     * execute in debug mode.
     */
    private static final boolean DEBUG;

    /**
     * The interval in milliseconds of the activation of the identification of
     * the dominant speaker in a multipoint conference.
     */
    private static final long DECISION_INTERVAL = 300;

    /**
     * The interval of time in milliseconds of idle execution of
     * DecisionMaker after which the latter should cease to exist. The
     * interval does not have to be very long because the background threads
     * running the DecisionMakers are pooled anyway.
     */
    private static final long DECISION_MAKER_IDLE_TIMEOUT = 15 * 1000;

    /**
     * The interval of time without a call to {@link Speaker#levelChanged(int,long)}
     * after which DominantSpeakerIdentification assumes that there
     * will be no report of a Speaker's level within a certain
     * time-frame. The default value of 40 is chosen in order to allow
     * non-aggressive fading of the last received or measured level and to be
     * greater than the most common RTP packet durations in milliseconds i.e.
     * 20 and 30. 
     */
    private static final long LEVEL_IDLE_TIMEOUT = 40;

    /**
     * The Logger used by the DominantSpeakerIdentification
     * class and its instances to print debug information.
     */
    private static final Logger logger = new LoggerImpl(DominantSpeakerIdentification.class.getName());

    /**
     * The (total) number of long time-intervals used for speech activity score
     * evaluation at a specific time-frame.
     */
    private static final int LONG_COUNT = 1;

    /**
     * The threshold in terms of active medium-length blocks which is used
     * during the speech activity evaluation step for the long time-interval.
     */
    private static final int LONG_THRESHOLD = 4;

    /**
     * The maximum value of audio level supported by
     * DominantSpeakerIdentification.
     */
    private static final int MAX_LEVEL = 127;

    /**
     * The minimum value of audio level supported by
     * DominantSpeakerIdentification.
     */
    private static final int MIN_LEVEL = 0;

    /**
     * The number of (audio) levels received or measured for a Speaker
     * to be monitored in order to determine that the minimum level for the
     * Speaker has increased.
     */
    private static final int MIN_LEVEL_WINDOW_LENGTH
        = 15 /* seconds */ * 1000 /* milliseconds */
            / 20 /* milliseconds per level */;

    /**
     * The minimum value of speech activity score supported by
     * DominantSpeakerIdentification. The value must be positive
     * because (1) we are going to use it as the argument of a logarithmic
     * function and the latter is undefined for negative arguments and (2) we
     * will be dividing by the speech activity score.
     */
    private static final double MIN_SPEECH_ACTIVITY_SCORE = 0.0000000001D;

    /**
     * The threshold in terms of active sub-bands in a frame which is used
     * during the speech activity evaluation step for the medium length
     * time-interval.
     */
    private static final int MEDIUM_THRESHOLD = 7;

    /**
     * The (total) number of sub-bands in the frequency range evaluated for
     * immediate speech activity. The implementation of the class
     * DominantSpeakerIdentification does not really operate on the
     * representation of the signal in the frequency domain, it works with audio
     * levels derived from RFC 6465 "A Real-time Transport Protocol (RTP)
     * Header Extension for Mixer-to-Client Audio Level Indication". 
     */
    private static final int N1 = 13;

    /**
     * The length/size of a sub-band in the frequency range evaluated for
     * immediate speech activity. In the context of the implementation of the
     * class DominantSpeakerIdentification, it specifies the
     * length/size of a sub-unit of the audio level range defined by RFC 6465.
     */
    private static final int N1_SUBUNIT_LENGTH = (MAX_LEVEL - MIN_LEVEL + N1 - 1) / N1;

    /**
     * The number of frames (i.e. {@link Speaker#immediates}) evaluated for
     * medium speech activity.
     */
    private static final int N2 = 5;

    /**
     * The number of medium-length blocks constituting a long time-interval.
     */
    private static final int N3 = 10;

    /**
     * The interval of time without a call to {@link Speaker#levelChanged(int,long)}
     * after which DominantSpeakerIdentification assumes that a
     * non-dominant Speaker is to be automatically removed from
     * {@link #speakers}.
     */
    private static final long SPEAKER_IDLE_TIMEOUT = 60 * 60 * 1000;

    /**
     * The default interval of silence before switching to "silence" (if silence detection is enabled).
     */
    private static final long DEFAULT_TIMEOUT_TO_SILENCE_INTERVAL = 3000;

    /**
     * The executor handed to instances created through the no-argument
     * constructor: a scheduled pool with a single core thread whose thread
     * factory is named "dsi".
     * NOTE(review): the {@code true} argument presumably requests daemon
     * threads - confirm against CustomizableThreadFactory.
     */
    private static final ScheduledExecutorService DEFAULT_EXECUTOR
            = Executors.newScheduledThreadPool(1, new CustomizableThreadFactory("dsi", true));

    static
    {
        // Sampled once, at class initialization; later changes of the log
        // level are not reflected in DEBUG.
        DEBUG = logger.isDebugEnabled();
    }

    /**
     * Computes the binomial coefficient "n choose r", i.e. the number of ways
     * of picking r unordered outcomes out of n possibilities.
     *
     * @param n the number of possibilities to pick from
     * @param r the number of unordered outcomes to pick from n
     * @return the binomial coefficient indexed by n and r
     */
    private static long binomialCoefficient(int n, int r)
    {
        // C(n, r) == C(n, n - r): keep the larger of the two so that the
        // loop below performs as few multiplications as possible.
        final int upper = Math.max(r, n - r);
        long result = 1;
        int divisor = 1;

        for (int factor = n; factor > upper; factor--)
        {
            // After k iterations result equals C(n, k), hence the division
            // is always exact.
            result = result * factor / divisor;
            divisor++;
        }

        return result;
    }

    /**
     * Aggregates consecutive, equally sized groups of {@code littles} into
     * {@code bigs}: each element of {@code bigs} receives the number of
     * elements of its group which are strictly greater than {@code threshold}.
     *
     * @param littles the fine-grained values to aggregate
     * @param bigs the coarse-grained counters to update in place
     * @param threshold the value a little has to exceed in order to be counted
     * @return {@code true} if at least one element of {@code bigs} was
     * modified; otherwise, {@code false}
     */
    private static boolean computeBigs(
            byte[] littles,
            byte[] bigs,
            int threshold)
    {
        final int groupSize = littles.length / bigs.length;
        boolean modified = false;
        int cursor = 0;

        for (int group = 0; group < bigs.length; group++)
        {
            final int groupEnd = cursor + groupSize;
            byte count = 0;

            while (cursor < groupEnd)
            {
                if (littles[cursor] > threshold)
                {
                    count++;
                }
                cursor++;
            }

            if (count != bigs[group])
            {
                bigs[group] = count;
                modified = true;
            }
        }

        return modified;
    }

    /**
     * Computes a speech activity score from the number of active units
     * {@code vL} out of {@code nR} units in total, clamped from below to
     * {@link #MIN_SPEECH_ACTIVITY_SCORE}.
     *
     * @param vL the number of active units
     * @param nR the total number of units
     * @param lambda the parameter whose logarithm is subtracted and whose
     * product with {@code vL} is added
     * @return the score, never less than {@link #MIN_SPEECH_ACTIVITY_SCORE}
     */
    private static double computeSpeechActivityScore(
            int vL,
            int nR,
            double lambda)
    {
        final double p = 0.5;
        final double logCombinations = Math.log(binomialCoefficient(nR, vL));

        // The terms are summed strictly left to right so that the
        // floating-point result does not depend on the formatting.
        final double rawScore
            = logCombinations + vL * Math.log(p)
                + (nR - vL) * Math.log(1 - p) - Math.log(lambda) + lambda * vL;

        return Math.max(rawScore, MIN_SPEECH_ACTIVITY_SCORE);
    }

    /**
     * The background task which repeatedly makes the (global) decision about speaker switches.
     */
    private DecisionMaker decisionMaker;

    /**
     * The identifier of the dominant speaker.
     */
    private T dominantId;

    /**
     * The last/latest time at which this DominantSpeakerIdentification
     * made a (global) decision about speaker switches. The (global) decision
     * about speaker switches should be made every {@link #DECISION_INTERVAL}
     * milliseconds.
     */
    private long lastDecisionTime;

    /**
     * The time in milliseconds of the most recent (audio) level report or
     * measurement (regardless of the Speaker).
     */
    private long lastLevelChangedTime;

    /**
     * The last/latest time at which this DominantSpeakerIdentification
     * notified the Speakers who have not received or measured audio
     * levels for a certain time (i.e. {@link #LEVEL_IDLE_TIMEOUT}) that they
     * will very likely not have a level within a certain time-frame of the
     * algorithm.
     */
    private long lastLevelIdleTime;

    /**
     * The relative speech activities for the immediate, medium and long
     * time-intervals, respectively, which were last calculated for a
     * Speaker. Simply reduces the number of allocations and the
     * penalizing effects of the garbage collector.
     */
    private final double[] relativeSpeechActivities = new double[3];

    /**
     * The Speakers in the multipoint conference associated with this
     * ActiveSpeakerDetector.
     */
    private final Map> speakers = new HashMap<>();

    /**
     * The special ID to use to indicate silence (no speakers).
     *
     * When set to null silence detection is disabled.
     */
    private T silenceId = null;

    /**
     * The interval (in milliseconds) of no activity before switching to {@link #silenceId} (if silence detection is
     * enabled).
     */
    private long timeoutToSilenceInterval = DEFAULT_TIMEOUT_TO_SILENCE_INTERVAL;

    /**
     * The Speakers in the multipoint conference with the highest
     * current energy levels.
     */
    private final ArrayList> loudest = new ArrayList<>();

    private final Clock clock;

    /**
     * The executor used to schedule {@link #decisionMaker}.
     */
    private final ScheduledExecutorService executor;

    /**
     * The number of current loudest speakers to keep track of.
     */
    private int numLoudestToTrack = 0;

    /**
     * Time in milliseconds after which speaker is removed from loudest list if
     * no new audio packets have been received from that speaker.
     */
    private int energyExpireTimeMs = 150;

    /**
     * Alpha factor for exponential smoothing of energy values, multiplied by 100.
     */
    private int energyAlphaPct = 50;

    /**
     * Initializes a new DominantSpeakerIdentification instance which
     * reads the time from {@link Clock#systemUTC()} and schedules its decision
     * making on the shared {@link #DEFAULT_EXECUTOR}.
     */
    public DominantSpeakerIdentification()
    {
        this(Clock.systemUTC(), DEFAULT_EXECUTOR);
    }

    /**
     * Initializes a new DominantSpeakerIdentification instance with a
     * specific time source and a specific executor.
     *
     * @param clock the clock from which the current time in milliseconds is
     * read
     * @param executor the executor on which the background decision making is
     * executed and re-scheduled
     * @throws NullPointerException if {@code clock} or {@code executor} is
     * {@code null}
     */
    public DominantSpeakerIdentification(Clock clock, ScheduledExecutorService executor)
    {
        // Fail at construction time rather than on the first audio level
        // report, where the origin of the null would be much harder to trace.
        this.clock = Objects.requireNonNull(clock, "clock");
        this.executor = Objects.requireNonNull(executor, "executor");
    }

    /**
     * Sets the energy ranking options and trims the list of loudest speakers
     * to the new maximum size.
     *
     * @param numLoudestToTrack_ the number of loudest speakers to keep track
     * of; {@code 0} disables the tracking
     * @param energyExpireTimeMs_ the time in milliseconds after which a speaker
     * without new levels is removed from the list of loudest speakers
     * @param energyAlphaPct_ the alpha factor of the exponential smoothing of
     * the energy values, multiplied by 100
     * @throws IllegalArgumentException if {@code numLoudestToTrack_} is
     * negative
     */
    public synchronized void setLoudestConfig(int numLoudestToTrack_, int energyExpireTimeMs_, int energyAlphaPct_)
    {
        // Validate before mutating any state: a negative count used to be
        // stored and then blow up in loudest.remove(int).
        if (numLoudestToTrack_ < 0)
        {
            throw new IllegalArgumentException("numLoudestToTrack must not be negative: " + numLoudestToTrack_);
        }

        numLoudestToTrack = numLoudestToTrack_;
        energyExpireTimeMs = energyExpireTimeMs_;
        energyAlphaPct = energyAlphaPct_;
        logger.trace(() -> "numLoudestToTrack = " + numLoudestToTrack);
        logger.trace(() -> "energyExpireTimeMs = " + energyExpireTimeMs);
        logger.trace(() -> "energyAlphaPct = " + energyAlphaPct);

        // Drop the speakers ranked beyond the new limit in a single operation.
        if (loudest.size() > numLoudestToTrack)
        {
            loudest.subList(numLoudestToTrack, loudest.size()).clear();
        }
    }

    /**
     * Enables silence detection with the default timeout, i.e.
     * {@link #DEFAULT_TIMEOUT_TO_SILENCE_INTERVAL} milliseconds. See
     * {@link #enableSilenceDetection(Object, long)}.
     *
     * @param silenceId the special ID to use for silence.
     */
    public void enableSilenceDetection(@NotNull T silenceId)
    {
        enableSilenceDetection(silenceId, DEFAULT_TIMEOUT_TO_SILENCE_INTERVAL);
    }

    /**
     * Enable silence detection. When enabled and the dominant speaker is silent for {@code timeout} ms, this DSI will
     * fire a dominant speaker changed event with ID {@code silenceId}.
     *
     * @param silenceId the special ID to use for silence
     * @param timeout the interval in milliseconds of no activity before switching to {@code silenceId}.
     * @throws NullPointerException if {@code silenceId} is {@code null}
     * @throws IllegalStateException if silence detection has already been enabled
     */
    public synchronized void enableSilenceDetection(@NotNull T silenceId, long timeout)
    {
        // A null ID means "disabled" internally, so accepting it would leave
        // silence detection off while still overwriting the timeout.
        Objects.requireNonNull(silenceId, "silenceId");

        if (this.silenceId != null)
        {
            throw new IllegalStateException("Silence detection already enabled");
        }

        logger.debug(() -> "Enabling silence detection with ID " + silenceId + " and timeout " + timeout + " ms.");
        this.silenceId = silenceId;
        timeoutToSilenceInterval = timeout;
    }

    /**
     * Notifies this DominantSpeakerIdentification that a specific
     * DecisionMaker has permanently stopped executing. The reference to
     * the current decision maker is cleared only if it is the very instance
     * which has exited; a notification from any other instance is ignored.
     *
     * @param decisionMaker the DecisionMaker which has exited
     */
    synchronized void decisionMakerExited(DecisionMaker decisionMaker)
    {
        if (this.decisionMaker != decisionMaker)
        {
            // Some other DecisionMaker has exited; nothing to update.
            return;
        }

        this.decisionMaker = null;
    }

    /**
     * Retrieves a JSON representation of this instance for the purposes of the
     * REST API of Videobridge.
     * 

* By the way, the method name reflects the fact that the method handles an * HTTP GET request. *

* * @return a JSONObject which represents this instance of the * purposes of the REST API of Videobridge */ @SuppressWarnings("unchecked") public JSONObject doGetJSON() { JSONObject jsonObject; if (DEBUG) { synchronized (this) { jsonObject = new JSONObject(); // dominantSpeaker T dominantSpeaker = getDominantSpeaker(); jsonObject.put("dominantSpeaker", Objects.toString(dominantSpeaker)); // speakers Collection> speakersCollection = this.speakers.values(); JSONArray speakersArray = new JSONArray(); for (Speaker speaker : speakersCollection) { JSONObject speakerJSONObject = new JSONObject(); // id speakerJSONObject.put("id", speaker.id.toString()); // levels speakerJSONObject.put("levels", speaker.getLevels()); speakersArray.add(speakerJSONObject); } jsonObject.put("speakers", speakersArray); } } else { // Retrieving a JSON representation of a // DominantSpeakerIdentification has been implemented for the // purposes of debugging only. jsonObject = null; } return jsonObject; } /** * Gets the identifier of the dominant speaker. */ public T getDominantSpeaker() { return dominantId; } /** * Gets the Speaker in this multipoint conference identified by a * specific {@code id}. If no such Speaker exists, a new Speaker * is initialized and returned. * * @param id the identifier of the Speaker to return. * @return the Speaker in this multipoint conference identified by {@code id}. */ @NotNull private synchronized Speaker getOrCreateSpeaker(T id) { Speaker speaker = speakers.get(id); if (speaker == null) { speaker = new Speaker<>(id); speakers.put(id, speaker); // Since we've created a new Speaker in the multipoint conference, // we'll very likely need to make a decision whether there have been // speaker switch events soon. maybeStartDecisionMaker(); } return speaker; } /** * Update loudest speaker list. * @param speaker the speaker with a new energy level * @param level the energy level * @param now the current time * @return The current ranking statistics. 
*/ private synchronized SpeakerRanking updateLoudestList(Speaker speaker, int level, long now) { boolean isDominant = dominantId != null && dominantId.equals(speaker.id); if (level < 0) { /* Ignore this level, it is too old. Just gather the stats. */ int rank = 0; while (rank < loudest.size() && loudest.get(rank) != speaker) ++rank; return new SpeakerRanking(isDominant, rank, speaker.energyScore); } /* exponential smoothing. round to nearest. */ speaker.energyScore = (energyAlphaPct * level + (100 - energyAlphaPct) * speaker.energyScore + 50) / 100; if (numLoudestToTrack == 0) return new SpeakerRanking(isDominant, 0, speaker.energyScore); long oldestValid = now - energyExpireTimeMs; logger.trace(() -> "Want to add " + speaker.id.toString() + " with score " + speaker.energyScore + ". Last level = " + level + "."); int i = 0; while (i < loudest.size()) { Speaker cur = loudest.get(i); if (cur.getLastLevelChangedTime() < oldestValid) { logger.trace(() -> "Removing " + cur.id.toString() + ". old."); loudest.remove(i); continue; } if (cur == speaker) { logger.trace(() -> "Removing " + cur.id.toString() + ". same."); loudest.remove(i); continue; } ++i; } int rank = 0; while (rank < loudest.size()) { Speaker cur = loudest.get(rank); if (cur.energyScore < speaker.energyScore) break; ++rank; } if (rank < numLoudestToTrack) { final int pos = rank; logger.trace(() -> "Adding " + speaker.id.toString() + " at position " + pos + "."); loudest.add(rank, speaker); if (loudest.size() > numLoudestToTrack) loudest.remove(numLoudestToTrack); } if (logger.isTraceEnabled()) { i = 0; while (i < loudest.size()) { Speaker cur = loudest.get(i); logger.trace("New list: " + i + ": " + cur.id.toString() + ": " + cur.energyScore + "."); ++i; } } return new SpeakerRanking(isDominant, rank, speaker.energyScore); } /** * Query whether a particular endpoint is currently one of the loudest speakers. 
*/ public synchronized boolean isAmongLoudest(T id) { return loudest.stream().anyMatch(speaker -> speaker.id.equals(id)); } /** * {@inheritDoc} */ @Override public SpeakerRanking levelChanged(T id, int level) { Speaker speaker; long now = clock.millis(); synchronized (this) { speaker = getOrCreateSpeaker(id); // Note that this ActiveSpeakerDetector is still in use. When it is // not in use long enough, its DecisionMaker i.e. background thread // will prepare itself and, consequently, this // DominantSpeakerIdentification for garbage collection. if (lastLevelChangedTime < now) { lastLevelChangedTime = now; // A report or measurement of an audio level indicates that this // DominantSpeakerIdentification is in use and, consequently, // that it'll very likely need to make a decision whether there // have been speaker switch events soon. maybeStartDecisionMaker(); } } int cookedLevel = speaker.levelChanged(level, now); return updateLoudestList(speaker, cookedLevel, now); } /** * Makes the decision whether there has been a speaker switch event. If * there has been such an event, notifies the registered listeners that a * new speaker is dominating the multipoint conference. */ private void makeDecision(long now) { // If we have to fire events to any registered listeners eventually, we // will want to do it outside the synchronized block. T oldDominantSpeakerValue = null, newDominantSpeakerValue = null; synchronized (this) { int speakerCount = speakers.size(); T newDominantId; if (speakerCount == 0) { // If there are no Speakers in a multipoint conference, then there are no speaker switch events to detect. // We either have no dominant speaker, or we're in a silence period (if silence detection is enabled). newDominantId = silenceId; } else if (speakerCount == 1) { // If there is a single Speaker in a multipoint conference, then it is either the dominant speaker, or // we're in a silence period (if silence detection is enabled). 
Speaker speaker = speakers.values().iterator().next(); newDominantId = speaker.id; if (silenceId != null) { long timeSinceNonSilence = now - speaker.lastNonSilence; if (timeSinceNonSilence > timeoutToSilenceInterval) { newDominantId = silenceId; } } } else { boolean inSilence = silenceId != null && dominantId == silenceId; Speaker dominantSpeaker = (dominantId == null) ? null : speakers.get(dominantId); // If there is no dominant speaker, nominate one at random and then // let the other speakers compete with the nominated one. if (dominantSpeaker == null && !inSilence) { Map.Entry> s = speakers.entrySet().iterator().next(); dominantSpeaker = s.getValue(); newDominantId = s.getKey(); } else if (inSilence) { newDominantId = silenceId; } else { newDominantId = null; } // At this point dominantSpeaker==null iff inSilence==true. if (dominantSpeaker != null) { dominantSpeaker.evaluateSpeechActivityScores(now); } double[] relativeSpeechActivities = this.relativeSpeechActivities; // If multiple speakers cause speaker switches, they compete among // themselves by their relative speech activities in the middle // time-interval. double newDominantC2 = C2; for (Map.Entry> s : speakers.entrySet()) { Speaker speaker = s.getValue(); // The dominant speaker does not compete with itself. In other // words, there is no use detecting a speaker switch from the // dominant speaker to the dominant speaker. Technically, the // relative speech activities are all zeroes for the dominant // speaker. if (speaker == dominantSpeaker) { continue; } speaker.evaluateSpeechActivityScores(now); // Compute the relative speech activities for the immediate, // medium and long time-intervals. for (int interval = 0; interval < relativeSpeechActivities.length; ++interval) { // When in a silence period we use MIN_SPEECH_ACTIVITY_SCORE as the scores to compete against. double dominantSpeakerScore = dominantSpeaker == null ? 
MIN_SPEECH_ACTIVITY_SCORE : dominantSpeaker.getSpeechActivityScore(interval); relativeSpeechActivities[interval] = Math.log(speaker.getSpeechActivityScore(interval) / dominantSpeakerScore); } double c1 = relativeSpeechActivities[0]; double c2 = relativeSpeechActivities[1]; double c3 = relativeSpeechActivities[2]; if ((c1 > C1) && (c2 > C2) && (c3 > C3) && (c2 > newDominantC2)) { // If multiple speakers cause speaker switches, they compete // among themselves by their relative speech activities in // the middle time-interval. newDominantC2 = c2; newDominantId = s.getKey(); } } if (silenceId != null && !inSilence && newDominantId == null && dominantSpeaker != null) { // We're not in a silence period, and none of the non-dominant speakers won the challenge. Check if // the current dominant speaker has been silent for the timeout period, and if so switch to "silence" // mode. long timeSinceNonSilence = now - dominantSpeaker.lastNonSilence; if (timeSinceNonSilence > timeoutToSilenceInterval) { newDominantId = silenceId; } } } if ((newDominantId != null) && !newDominantId.equals(dominantId)) { oldDominantSpeakerValue = dominantId; dominantId = newDominantId; newDominantSpeakerValue = dominantId; } } // synchronized (this) // Now that we are outside the synchronized block, fire events, if any, // to any registered listeners. if ((newDominantSpeakerValue != null) && !newDominantSpeakerValue.equals(oldDominantSpeakerValue)) { fireActiveSpeakerChanged(newDominantSpeakerValue); } } /** * Starts a background thread which is to repeatedly make the (global) * decision about speaker switches if such a background thread has not been * started yet and if the current state of this * DominantSpeakerIdentification justifies the start of such a * background thread (e.g. there is at least one Speaker in this * multipoint conference). 
*/ private synchronized void maybeStartDecisionMaker() { if ((this.decisionMaker == null) && !speakers.isEmpty()) { DecisionMaker decisionMaker = new DecisionMaker(this); boolean scheduled = false; this.decisionMaker = decisionMaker; try { executor.execute(decisionMaker); scheduled = true; } finally { if (!scheduled && (this.decisionMaker == decisionMaker)) { this.decisionMaker = null; } } } } /** * Runs in the background/daemon Thread of {@link #decisionMaker} * and makes the decision whether there has been a speaker switch event. * * @return a negative integer if the DecisionMaker is to exit or * a non-negative integer to specify the time in milliseconds until the next * execution of the DecisionMaker */ private long runInDecisionMaker() { long now = clock.millis(); long levelIdleTimeout = LEVEL_IDLE_TIMEOUT - (now - lastLevelIdleTime); long sleep = 0; if (levelIdleTimeout <= 0) { if (lastLevelIdleTime != 0) { timeoutIdleLevels(now); } lastLevelIdleTime = now; } else { sleep = levelIdleTimeout; } long decisionTimeout = DECISION_INTERVAL - (now - lastDecisionTime); if (decisionTimeout <= 0) { // The identification of the dominant active speaker may be a // time-consuming ordeal so the time of the last decision is the // time of the beginning of a decision iteration. lastDecisionTime = now; makeDecision(now); // The identification of the dominant active speaker may be a // time-consuming ordeal so the timeout to the next decision // iteration should be computed after the end of the decision // iteration. decisionTimeout = DECISION_INTERVAL - (clock.millis() - now); } if ((decisionTimeout > 0) && (sleep > decisionTimeout)) { sleep = decisionTimeout; } return sleep; } /** * Runs in the background/daemon Thread of a specific * DecisionMaker and makes the decision whether there has been a * speaker switch event. 
* * @param decisionMaker the DecisionMaker invoking the method * @return a negative integer if the decisionMaker is to exit or * a non-negative integer to specify the time in milliseconds until the next * execution of the decisionMaker */ long runInDecisionMaker(DecisionMaker decisionMaker) { synchronized (this) { // Most obviously, DecisionMakers no longer employed by this // DominantSpeakerIdentification should cease to exist as soon as // possible. if (this.decisionMaker != decisionMaker) { return -1; } // If the decisionMaker has been unnecessarily executing long // enough, kill it in order to have a more deterministic behavior // with respect to disposal. if (0 < lastDecisionTime) { long idle = lastDecisionTime - lastLevelChangedTime; if (idle >= DECISION_MAKER_IDLE_TIMEOUT) { return -1; } } } return runInDecisionMaker(); } /** * Notifies the Speakers in this multipoint conference who have not * received or measured audio levels for a certain time (i.e. * {@link #LEVEL_IDLE_TIMEOUT}) that they will very likely not have a level * within a certain time-frame of the DominantSpeakerIdentification * algorithm. Additionally, removes the non-dominant Speakers who * have not received or measured audio levels for far too long (i.e. * {@link #SPEAKER_IDLE_TIMEOUT}). * * @param now the time at which the timing out is being detected */ private synchronized void timeoutIdleLevels(long now) { Iterator>> i = speakers.entrySet().iterator(); while (i.hasNext()) { Speaker speaker = i.next().getValue(); long idle = now - speaker.getLastLevelChangedTime(); // Remove a non-dominant Speaker if he/she has been idle for far too // long. if ((SPEAKER_IDLE_TIMEOUT < idle) && ((dominantId == null) || (speaker.id != dominantId))) { i.remove(); } else if (LEVEL_IDLE_TIMEOUT < idle) { speaker.levelTimedOut(); } } } /** * Represents the background thread which repeatedly makes the (global) * decision about speaker switches. 
Weakly references an associated {@code DominantSpeakerIdentification}
     * instance in order to eventually detect that the multipoint conference
     * has actually expired and that the background {@code Thread} should
     * perish.
     *
     * @author Lyubomir Marinov
     */
    private static class DecisionMaker
        implements Runnable
    {
        /**
         * The {@code DominantSpeakerIdentification} instance which is
         * repeatedly run into this background thread in order to make the
         * (global) decision about speaker switches. It is a
         * {@code WeakReference} in order to eventually detect that the
         * multipoint conference has actually expired and that this background
         * {@code Thread} should perish.
         */
        private final WeakReference<DominantSpeakerIdentification<?>> algorithm;

        /**
         * Initializes a new {@code DecisionMaker} instance which is to
         * repeatedly run a specific {@code DominantSpeakerIdentification}
         * into a background thread in order to make the (global) decision about
         * speaker switches.
         *
         * @param algorithm the {@code DominantSpeakerIdentification} to be
         * repeatedly run by the new instance in order to make the (global)
         * decision about speaker switches
         */
        public DecisionMaker(DominantSpeakerIdentification<?> algorithm)
        {
            this.algorithm = new WeakReference<>(algorithm);
        }

        /**
         * Runs {@link #algorithm} once i.e. makes the (global) decision about
         * speaker switches, and then either re-schedules this instance on the
         * algorithm's executor or notifies the algorithm that this instance
         * will no longer run. Does nothing if the algorithm has already been
         * garbage collected.
         */
        @Override
        public void run()
        {
            DominantSpeakerIdentification<?> algorithm = this.algorithm.get();

            if (algorithm == null)
            {
                // The conference has expired; there is nobody left to run for or to notify.
                return;
            }

            long sleep;

            try
            {
                sleep = algorithm.runInDecisionMaker(this);
            }
            catch (Exception ignored)
            {
                // If an exception occurs we do not re-schedule.
                sleep = -1;
            }

            // A negative sleep value is contracted to mean that this DecisionMaker should not re-schedule itself.
            if (sleep < 0)
            {
                // Notify the algorithm that this DecisionMaker will no longer run. Subsequently, the algorithm may
                // decide to create and schedule another one if and when it's needed. The strong reference obtained
                // above is still held, so there is no need to dereference the WeakReference a second time.
                algorithm.decisionMakerExited(this);
            }
            else
            {
                algorithm.executor.schedule(this, sleep, TimeUnit.MILLISECONDS);
            }
        }
    }

    /**
     * Represents a speaker in a multipoint conference identified by an ID.
     *
     * @param <U> the type of the identifier of the speaker
     *
     * @author Lyubomir Marinov
     */
    private class Speaker<U>
    {
        /**
         * The per-level values derived from {@link #levels} by
         * {@link #computeImmediates()}: every level below the silence
         * threshold is replaced with {@code MIN_LEVEL} and the result is
         * divided by {@code N1_SUBUNIT_LENGTH}. Index 0 corresponds to the
         * most recent level.
         */
        private final byte[] immediates = new byte[LONG_COUNT * N3 * N2];

        /**
         * The last {@code clock} time in millis at which this speaker was not silent. We consider it being silent if
         * {@link #longSpeechActivityScore} == {@code MIN_SPEECH_ACTIVITY_SCORE} to allow short bursts of activity
         * to not interrupt a silence period.
         */
        private long lastNonSilence = -1;

        /**
         * The speech activity score of this {@code Speaker} for the immediate
         * time-interval.
         */
        private double immediateSpeechActivityScore = MIN_SPEECH_ACTIVITY_SCORE;

        /**
         * The time in milliseconds of the most recent invocation of
         * {@link #levelChanged(int,long)} i.e. the last time at which an actual
         * (audio) level was reported or measured for this {@code Speaker}. If
         * no level is reported or measured for this {@code Speaker} long
         * enough i.e. {@code LEVEL_IDLE_TIMEOUT}, the associated
         * {@code DominantSpeakerIdentification} will presume that this
         * {@code Speaker} was muted for the duration of a certain frame.
         */
        private long lastLevelChangedTime = clock.millis();

        /**
         * The (history of) audio levels received or measured for this
         * {@code Speaker}. Index 0 holds the most recent level.
         */
        private final byte[] levels;

        /**
         * The values for the long time-interval, derived from
         * {@link #mediums} by {@link #computeLongs()}.
         */
        private final byte[] longs = new byte[LONG_COUNT];

        /**
         * The speech activity score of this {@code Speaker} for the long
         * time-interval.
         */
        private double longSpeechActivityScore = MIN_SPEECH_ACTIVITY_SCORE;

        /**
         * The values for the medium time-interval, derived from
         * {@link #immediates} by {@link #computeMediums()}.
         */
        private final byte[] mediums = new byte[LONG_COUNT * N3];

        /**
         * The speech activity score of this {@code Speaker} for the medium
         * time-interval.
         */
        private double mediumSpeechActivityScore = MIN_SPEECH_ACTIVITY_SCORE;

        /**
         * The minimum (audio) level received or measured for this
         * {@code Speaker}. Since {@code MIN_LEVEL} is specified for samples
         * generated by a muted audio source, a value equal to
         * {@code MIN_LEVEL} indicates that the minimum level for this
         * {@code Speaker} has not been determined yet.
         */
        private byte minLevel = MIN_LEVEL;

        /**
         * The (current) estimate of the minimum (audio) level received or
         * measured for this {@code Speaker}. Used to increase the value of
         * {@link #minLevel}.
         */
        private byte nextMinLevel = MIN_LEVEL;

        /**
         * The number of subsequent (audio) levels received or measured for this
         * {@code Speaker} which have been monitored thus far in order to
         * estimate an up-to-date minimum (audio) level received or measured for
         * this {@code Speaker}.
         */
        private int nextMinLevelWindowLength;

        /**
         * Exponential smoothing of filtered energy values.
         * Synchronized by parent class.
         */
        int energyScore;

        /**
         * The identifier of this {@code Speaker} which is unique within this {@link DominantSpeakerIdentification}.
         */
        public final U id;

        /**
         * Initializes a new {@code Speaker} instance with a specific
         * identifier.
         *
         * @param id the object identifying this speaker
         */
        public Speaker(U id)
        {
            this.id = id;
            // One immediate value is derived from each stored level, so both histories have the same length.
            levels = new byte[immediates.length];
        }

        /**
         * Recomputes {@link #immediates} from {@link #levels}.
         *
         * @return {@code true} if at least one element of
         * {@link #immediates} changed; otherwise, {@code false}
         */
        private boolean computeImmediates()
        {
            // The minimum audio level received or measured for this Speaker is
            // the level of "silence" for this Speaker. Since the various
            // Speakers may differ in their levels of "silence", put all
            // Speakers on equal footing by replacing the individual levels of
            // "silence" with the uniform level of absolute silence.
            //
            // The threshold is deliberately kept as an int: narrowing the sum
            // to byte would wrap around to a negative value for a high
            // minLevel, after which no level would ever be treated as silence.
            // levelChanged(int, long) performs the same comparison in int.
            final int silenceThreshold = this.minLevel + N1_SUBUNIT_LENGTH;
            boolean changed = false;

            for (int i = 0; i < immediates.length; ++i)
            {
                byte level = levels[i];

                if (level < silenceThreshold)
                {
                    level = MIN_LEVEL;
                }

                byte immediate = (byte) (level / N1_SUBUNIT_LENGTH);

                if (immediates[i] != immediate)
                {
                    immediates[i] = immediate;
                    changed = true;
                }
            }
            return changed;
        }

        /**
         * Recomputes {@link #longs} from {@link #mediums} using
         * {@code LONG_THRESHOLD}.
         *
         * @return the value returned by {@code computeBigs}
         */
        private boolean computeLongs()
        {
            return computeBigs(mediums, longs, LONG_THRESHOLD);
        }

        /**
         * Recomputes {@link #mediums} from {@link #immediates} using
         * {@code MEDIUM_THRESHOLD}.
         *
         * @return the value returned by {@code computeBigs}
         */
        private boolean computeMediums()
        {
            return computeBigs(immediates, mediums, MEDIUM_THRESHOLD);
        }

        /**
         * Computes/evaluates the speech activity score of this
         * {@code Speaker} for the immediate time-interval.
         */
        private void evaluateImmediateSpeechActivityScore()
        {
            immediateSpeechActivityScore = computeSpeechActivityScore(immediates[0], N1, 0.78);
        }

        /**
         * Computes/evaluates the speech activity score of this
         * {@code Speaker} for the long time-interval and records
         * {@code now} as the last time of non-silence if the new score is
         * above {@code MIN_SPEECH_ACTIVITY_SCORE}.
         *
         * @param now the current time in milliseconds
         */
        private void evaluateLongSpeechActivityScore(long now)
        {
            longSpeechActivityScore = computeSpeechActivityScore(longs[0], N3, 47);
            if (longSpeechActivityScore > MIN_SPEECH_ACTIVITY_SCORE)
            {
                lastNonSilence = now;
            }
        }

        /**
         * Computes/evaluates the speech activity score of this
         * {@code Speaker} for the medium time-interval.
         */
        private void evaluateMediumSpeechActivityScore()
        {
            mediumSpeechActivityScore = computeSpeechActivityScore(mediums[0], N2, 24);
        }

        /**
         * Evaluates the speech activity scores of this {@code Speaker} for
         * the immediate, medium, and long time-intervals. Invoked when it is
         * time to decide whether there has been a speaker switch event. The
         * score of a longer time-interval is re-evaluated only if the values
         * of the shorter time-interval it is derived from have changed.
         *
         * @param now the current time in milliseconds
         */
        synchronized void evaluateSpeechActivityScores(long now)
        {
            if (!computeImmediates())
            {
                return;
            }
            evaluateImmediateSpeechActivityScore();

            if (!computeMediums())
            {
                return;
            }
            evaluateMediumSpeechActivityScore();

            if (computeLongs())
            {
                evaluateLongSpeechActivityScore(now);
            }
        }

        /**
         * Gets the time in milliseconds at which an actual (audio) level was
         * reported or measured for this {@code Speaker} last.
         *
         * @return the time in milliseconds at which an actual (audio) level
         * was reported or measured for this {@code Speaker} last
         */
        public synchronized long getLastLevelChangedTime()
        {
            return lastLevelChangedTime;
        }

        /**
         * Gets the (history of) audio levels received or measured for this
         * {@code Speaker}.
         *
         * @return a {@code String} that lists the (history of) audio
         * levels received or measured for this {@code Speaker}, oldest
         * first, formatted as {@code [a,b,c]}
         */
        String getLevels()
        {
            // The levels of Speaker are internally maintained starting with the
            // last audio level received or measured for this Speaker and ending
            // with the first audio level received or measured for this Speaker.
            // Reverse the list and print them in the order they were received.
            StringBuilder sb = new StringBuilder();

            sb.append('[');
            for (int s = levels.length - 1; s >= 0; --s)
            {
                // A leading separator (rather than a trailing one which is
                // overwritten afterwards) keeps the brackets balanced even
                // for an empty history.
                if (s != levels.length - 1)
                {
                    sb.append(',');
                }
                sb.append(levels[s]);
            }
            sb.append(']');
            return sb.toString();
        }

        /**
         * Gets the speech activity score of this {@code Speaker} for a
         * specific time-interval.
         *
         * @param interval {@code 0} for the immediate time-interval,
         * {@code 1} for the medium time-interval, or {@code 2} for the long
         * time-interval
         * @return the speech activity score of this {@code Speaker} for the
         * time-interval specified by {@code interval}
         * @throws IllegalArgumentException if {@code interval} is not
         * {@code 0}, {@code 1}, or {@code 2}
         */
        double getSpeechActivityScore(int interval)
        {
            switch (interval)
            {
            case 0:
                return immediateSpeechActivityScore;
            case 1:
                return mediumSpeechActivityScore;
            case 2:
                return longSpeechActivityScore;
            default:
                throw new IllegalArgumentException("interval " + interval);
            }
        }

        /**
         * Notifies this {@code Speaker} that a new audio level has been
         * received or measured at a specific time.
         *
         * @param level the audio level which has been received or measured for
         * this {@code Speaker}
         * @param time the (local {@code System}) time in milliseconds at which
         * the specified {@code level} has been received or measured
         * @return the audio level that was applied, after any filtering or
         * level adjustment has taken place: the clamped level if it reaches
         * the silence threshold of this {@code Speaker}, otherwise
         * {@code 0}. If negative, the audio level was ignored because
         * {@code time} precedes the last reported time.
         */
        public synchronized int levelChanged(int level, long time)
        {
            // It sounds relatively reasonable that late audio levels should
            // better be discarded.
            if (time < lastLevelChangedTime)
            {
                return -1;
            }

            lastLevelChangedTime = time;

            // Ensure that the specified level is within the supported
            // range.
            byte b;

            if (level < MIN_LEVEL)
            {
                b = MIN_LEVEL;
            }
            else if (level > MAX_LEVEL)
            {
                b = MAX_LEVEL;
            }
            else
            {
                b = (byte) level;
            }

            // Push the specified level into the history of audio levels
            // received or measured for this Speaker.
            System.arraycopy(levels, 0, levels, 1, levels.length - 1);
            levels[0] = b;

            // Determine the minimum level received or measured for this
            // Speaker.
            updateMinLevel(b);

            return b >= (minLevel + N1_SUBUNIT_LENGTH) ? b : 0;
        }

        /**
         * Notifies this {@code Speaker} that no new audio level has been
         * received or measured for a certain time which very likely means that
         * this {@code Speaker} will not have a level within a certain
         * time-frame of a {@code DominantSpeakerIdentification} algorithm.
         * Pushes {@code MIN_LEVEL} into the history without advancing the
         * time of the last level change.
         */
        public synchronized void levelTimedOut()
        {
            levelChanged(MIN_LEVEL, lastLevelChangedTime);
        }

        /**
         * Updates the minimum (audio) level received or measured for this
         * {@code Speaker} in light of the receipt of a specific level.
         *
         * @param level the audio level received or measured for this
         * {@code Speaker}
         */
        private void updateMinLevel(byte level)
        {
            if (level == MIN_LEVEL)
            {
                // MIN_LEVEL carries no information about the minimum level.
                return;
            }

            if ((minLevel == MIN_LEVEL) || (minLevel > level))
            {
                // A first or a new lower minimum: adopt it and restart the estimation window.
                minLevel = level;
                nextMinLevel = MIN_LEVEL;
                nextMinLevelWindowLength = 0;
                return;
            }

            // The specified (audio) level is greater than the minimum
            // level received or measured for this Speaker. However, the
            // minimum level may be out-of-date by now. Estimate an
            // up-to-date minimum level and, eventually, make it the
            // minimum level received or measured for this Speaker.
            if (nextMinLevel == MIN_LEVEL)
            {
                nextMinLevel = level;
                nextMinLevelWindowLength = 1;
                return;
            }

            if (nextMinLevel > level)
            {
                nextMinLevel = level;
            }
            nextMinLevelWindowLength++;

            if (nextMinLevelWindowLength >= MIN_LEVEL_WINDOW_LENGTH)
            {
                // The arithmetic mean will increase the minimum
                // level faster than the geometric mean. Since the
                // goal is to track a minimum, it sounds reasonable
                // to go with a slow increase.
                double newMinLevel = Math.sqrt(minLevel * (double) nextMinLevel);

                // Ensure that the new minimum level is within the
                // supported range.
                if (newMinLevel < MIN_LEVEL)
                {
                    newMinLevel = MIN_LEVEL;
                }
                else if (newMinLevel > MAX_LEVEL)
                {
                    newMinLevel = MAX_LEVEL;
                }

                minLevel = (byte) newMinLevel;
                nextMinLevel = MIN_LEVEL;
                nextMinLevelWindowLength = 0;
            }
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy