core.be.tarsos.dsp.WaveformSimilarityBasedOverlapAdd Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of TarsosDSP Show documentation
TarsosDSP library fork
The newest version!
/*
*      _______                       _____   _____ _____  
*     |__   __|                     |  __ \ / ____|  __ \ 
*        | | __ _ _ __ ___  ___  ___| |  | | (___ | |__) |
*        | |/ _` | '__/ __|/ _ \/ __| |  | |\___ \|  ___/ 
*        | | (_| | |  \__ \ (_) \__ \ |__| |____) | |     
*        |_|\__,_|_|  |___/\___/|___/_____/|_____/|_|     
*                                                         
* -------------------------------------------------------------
*
* TarsosDSP is developed by Joren Six at IPEM, University Ghent
*  
* -------------------------------------------------------------
*
*  Info: http://0110.be/tag/TarsosDSP
*  Github: https://github.com/JorenSix/TarsosDSP
*  Releases: http://0110.be/releases/TarsosDSP/
*  
*  TarsosDSP includes modified source code by various authors,
*  for credits and info, see README.
* 
*/


package be.tarsos.dsp;


/**
 *
 * 
 * An overlap-add technique based on waveform similarity (WSOLA) for high
 * quality time-scale modification of speech
 * 
 * 
 * A concept of waveform similarity for tackling the problem of time-scale
 * modification of speech is proposed. It is worked out in the context of
 * short-time Fourier transform representations. The resulting WSOLA
 * (waveform-similarity-based synchronized overlap-add) algorithm produces
 * high-quality speech output, is algorithmically and computationally efficient
 * and robust, and allows for online processing with arbitrary time-scaling
 * factors that may be specified in a time-varying fashion and can be chosen
 * over a wide continuous range of values.
 * 
 * 
 * Inspired by the work soundtouch by Olli Parviainen,
 * http://www.surina.net/soundtouch, especially the TDStrech.cpp file.
 * 
 * @author Joren Six
 * @author Olli Parviainen
 */
public class WaveformSimilarityBasedOverlapAdd implements AudioProcessor {
	/**
	 * Length of one processing sequence, in samples. Despite its name this is
	 * derived from {@link Parameters#getSequenceMs()}.
	 */
	private int seekWindowLength;
	/**
	 * Number of candidate offsets scanned for the best overlap position, in
	 * samples. Derived from {@link Parameters#getSeekWindowMs()}.
	 */
	private int seekLength;
	/** Length of the cross-fade between two consecutive sequences, in samples. */
	private int overlapLength;

	/** Tail of the previous sequence; it is cross-faded into the start of the next output block. */
	private float[] pMidBuffer;
	/** {@link #pMidBuffer} weighted by {@code i * (overlapLength - i)}; the reference for the correlation. */
	private float[] pRefMidBuffer;
	/** Output block handed to the audio event; holds {@code seekWindowLength - overlapLength} samples. */
	private float[] outputFloatBuffer;

	/** {@code tempo * (seekWindowLength - overlapLength)}, rounded to the nearest integer. */
	private int intskip;
	/** Number of input samples {@link #process(AudioEvent)} expects in each buffer. */
	private int sampleReq;

	private double tempo;

	/** Dispatcher that is told about new buffer sizes when parameters change; may be {@code null}. */
	private AudioDispatcher dispatcher;

	/** Parameters waiting to be applied, or {@code null} when nothing is pending. */
	private Parameters newParameters;

	/**
	 * Create a new instance based on algorithm parameters for a certain audio format.
	 * The parameters are applied immediately.
	 *
	 * @param params The parameters for the algorithm.
	 */
	public WaveformSimilarityBasedOverlapAdd(Parameters  params){
		setParameters(params);
		applyNewParameters();
	}

	/**
	 * Stores new parameters. They are not applied here: {@link #process(AudioEvent)}
	 * applies them at the end of the block during which it finds them pending.
	 *
	 * @param params The parameters to apply; {@code null} means nothing is pending.
	 */
	public void setParameters(Parameters params){
		newParameters = params;
	}

	/**
	 * Sets the dispatcher whose step size and overlap are updated when new
	 * parameters are applied from within {@link #process(AudioEvent)}.
	 *
	 * @param newDispatcher The dispatcher to notify.
	 */
	public void setDispatcher(AudioDispatcher newDispatcher){
		this.dispatcher = newDispatcher;
	}

	/**
	 * Converts the pending parameters to sample counts, (re)allocates the
	 * working buffers and clears the pending parameters.
	 */
	private void applyNewParameters(){
		Parameters params = newParameters;
		double sampleRate = params.getSampleRate();
		overlapLength = (int) ((sampleRate * params.getOverlapMs())/1000);
		seekWindowLength = (int) ((sampleRate * params.getSequenceMs())/1000);
		seekLength = (int) ((sampleRate * params.getSeekWindowMs())/1000);

		tempo = params.getTempo();

		// The mid buffers are allocated with 8 times the needed length so that
		// moderate changes of overlapLength do not force a reallocation. They
		// are (re)allocated whenever they are missing or too small; the stored
		// tail of the previous sequence is kept so the next cross-fade still
		// has data to fade from.
		if(pMidBuffer == null || overlapLength > pMidBuffer.length){
			float[] grownMidBuffer = new float[overlapLength * 8];
			if(pMidBuffer != null){
				System.arraycopy(pMidBuffer, 0, grownMidBuffer, 0, pMidBuffer.length);
			}
			pMidBuffer = grownMidBuffer;
			// No copy needed: precalcCorrReferenceMono() rewrites this buffer before every seek.
			pRefMidBuffer = new float[overlapLength * 8];
			System.out.println("New overlapLength" + overlapLength);
		}

		double nominalSkip = tempo * (seekWindowLength - overlapLength);
		intskip = (int) (nominalSkip + 0.5);

		sampleReq = Math.max(intskip + overlapLength, seekWindowLength) + seekLength;

		float[] prevOutputBuffer = outputFloatBuffer;
		outputFloatBuffer = new float[getOutputBufferSize()];
		if(prevOutputBuffer!=null){
			System.out.println("Copy outputFloatBuffer contents");
			int samplesToKeep = Math.min(prevOutputBuffer.length, outputFloatBuffer.length);
			System.arraycopy(prevOutputBuffer, 0, outputFloatBuffer, 0, samplesToKeep);
		}

		newParameters = null;
	}

	/**
	 * @return The number of input samples expected in each buffer passed to
	 *         {@link #process(AudioEvent)}.
	 */
	public int getInputBufferSize(){
		return sampleReq;
	}

	/**
	 * @return The number of samples written to the output buffer for each processed block.
	 */
	private int getOutputBufferSize(){
		return seekWindowLength - overlapLength;
	}

	/**
	 * @return The input buffer size minus the skip per block
	 *         ({@code sampleReq - intskip}); this is the overlap value passed
	 *         to the dispatcher when parameters change.
	 */
	public int getOverlap(){
		return sampleReq-intskip;
	}


	/**
	 * Cross-fades the stored tail of the previous sequence ('midBuffer') with
	 * the input: over {@code overlapLength} samples the weight of the input
	 * rises linearly from 0 while the weight of the stored tail falls to 0.
	 *
	 * @param output The output buffer.
	 * @param outputOffset Index in the output buffer of the first sample to write.
	 * @param input The input buffer.
	 * @param inputOffset Index in the input buffer of the first sample to read.
	 */
	private void overlap(final float[] output, int outputOffset, float[] input,int inputOffset){
		for(int i = 0 ; i < overlapLength ; i++){
			int itemp = overlapLength - i;
			output[i + outputOffset] = (input[i + inputOffset] * i + pMidBuffer[i] * itemp ) / overlapLength;
		}
	}


	/**
	 * Seeks for the optimal overlap-mixing position.
	 *
	 * The best position is determined as the position where the two overlapped
	 * sample sequences are 'most alike', in terms of the highest
	 * cross-correlation value over the overlapping period.
	 *
	 * @param inputBuffer The input buffer
	 * @param position The position where to start the seek operation, in the input buffer.
	 * @return The best offset, relative to {@code position}, in the range
	 *         {@code [0, seekLength)}; 0 when {@code seekLength} is not positive.
	 */
	private int seekBestOverlapPosition(float[] inputBuffer, int position) {
		// Slopes the amplitude of the 'midBuffer' samples
		precalcCorrReferenceMono();

		double bestCorrelation = -10;
		int bestOffset = 0;

		// Scans for the best correlation value by testing each possible
		// position over the permitted range.
		for (int tempOffset = 0; tempOffset < seekLength; tempOffset++) {
			int comparePosition = position + tempOffset;

			// Calculates correlation value for the mixing position
			// corresponding to 'tempOffset'
			double currentCorrelation = calcCrossCorr(pRefMidBuffer, inputBuffer, comparePosition);

			// heuristic rule to slightly favor values close to mid of the
			// range: tmp runs from -1 at the start of the range towards +1 at the end
			double tmp = (double) (2 * tempOffset - seekLength) / seekLength;
			currentCorrelation = ((currentCorrelation + 0.1) * (1.0 - 0.25 * tmp * tmp));

			// Checks for the highest correlation value
			if (currentCorrelation > bestCorrelation) {
				bestCorrelation = currentCorrelation;
				bestOffset = tempOffset;
			}
		}

		return bestOffset;
	}

	/**
	 * Slopes the amplitude of the 'midBuffer' samples so that cross correlation
	 * is faster to calculate: the weight {@code i * (overlapLength - i)} is
	 * applied once to the reference here instead of once per candidate offset
	 * inside the seek loop.
	 */
	void precalcCorrReferenceMono()
	{
		for (int i = 0; i < overlapLength; i++){
			float temp = i * (overlapLength - i);
			pRefMidBuffer[i] = pMidBuffer[i] * temp;
		}
	}


	/**
	 * Calculates the cross-correlation between the reference and the samples
	 * to compare, divided by the square root of the energy of the reference.
	 *
	 * @param mixingPos The reference samples; at least {@code overlapLength} long.
	 * @param compare The buffer to compare the reference with.
	 * @param offset Index in {@code compare} that is aligned with index 0 of the reference.
	 * @return The correlation value.
	 */
	double calcCrossCorr(float[] mixingPos, float[] compare, int offset){
		double corr = 0;
		double norm = 0;
		// Index 0 is skipped: in the reference built by
		// precalcCorrReferenceMono() that sample is always weighted by 0.
		for (int i = 1; i < overlapLength; i ++){
			corr += mixingPos[i] * compare[i + offset];
			norm += mixingPos[i] * mixingPos[i];
		}
		// To avoid division by zero.
		if (norm < 1e-8){
			norm = 1.0;
		}
		return corr / Math.sqrt(norm);
	}


	/**
	 * Produces one time-stretched output block from the input buffer of the
	 * event and replaces the buffer of the event with that block. Pending
	 * parameters are applied afterwards, so they take effect from the next
	 * block on.
	 *
	 * @param audioEvent The event carrying {@link #getInputBufferSize()} input samples.
	 * @return Always {@code true}.
	 * @throws IllegalStateException If parameters are pending but no dispatcher has been set.
	 */
	@Override
	public boolean process(AudioEvent audioEvent) {
		float[] audioFloatBuffer = audioEvent.getFloatBuffer();
		assert audioFloatBuffer.length == getInputBufferSize();

		//Search for the best overlapping position.
		int offset =  seekBestOverlapPosition(audioFloatBuffer,0);

		// Mix the samples in the 'inputBuffer' at position of 'offset' with the
		// samples in 'midBuffer' using sliding overlapping
		// ... first partially overlap with the end of the previous sequence
		// (that's in 'midBuffer')
		overlap(outputFloatBuffer,0,audioFloatBuffer,offset);

		//copy sequence samples from input to output
		int sequenceLength = seekWindowLength - 2 * overlapLength;
		System.arraycopy(audioFloatBuffer, offset + overlapLength, outputFloatBuffer, overlapLength, sequenceLength);

		// Copies the end of the current sequence from 'inputBuffer' to
		// 'midBuffer' for being mixed with the beginning of the next
		// processing sequence and so on
		System.arraycopy(audioFloatBuffer, offset + sequenceLength + overlapLength, pMidBuffer, 0, overlapLength);

		assert outputFloatBuffer.length == getOutputBufferSize();

		audioEvent.setFloatBuffer(outputFloatBuffer);
		audioEvent.setOverlap(0);

		if(newParameters!=null){
			// Fail before touching any state: the new buffer sizes are only
			// useful if the dispatcher can be told about them.
			if(dispatcher == null){
				throw new IllegalStateException("A dispatcher must be set with setDispatcher() before new parameters can be applied.");
			}
			applyNewParameters();
			dispatcher.setStepSizeAndOverlap(getInputBufferSize(),getOverlap());
		}

		return true;
	}

	@Override
	public void processingFinished() {
		// NOOP
	}



	/**
	 * An object to encapsulate some of the parameters for
	 *         WSOLA, together with a couple of practical helper functions.
	 *
	 * @author Joren Six
	 */
	public static class Parameters {
		private final int sequenceMs;
		private final int seekWindowMs;
		private final int overlapMs;

		private final double tempo;
		private final double sampleRate;

		/**
		 * @param tempo
		 *            The tempo change 1.0 means unchanged, 2.0 is + 100% , 0.5
		 *            is half of the speed.
		 * @param sampleRate
		 *            The sample rate of the audio 44.1kHz is common.
		 * @param newSequenceMs
		 *            Length of a single processing sequence, in milliseconds.
		 *            This determines to how long sequences the original sound
		 *            is chopped in the time-stretch algorithm.
		 *
		 *            The larger this value is, the lesser sequences are used in
		 *            processing. In principle a bigger value sounds better when
		 *            slowing down tempo, but worse when increasing tempo and
		 *            vice versa.
		 *
		 *            Increasing this value reduces computational burden & vice
		 *            versa.
		 * @param newSeekWindowMs
		 *            Seeking window length in milliseconds for algorithm that
		 *            finds the best possible overlapping location. This
		 *            determines from how wide window the algorithm may look for
		 *            an optimal joining location when mixing the sound
		 *            sequences back together.
		 *
		 *            The bigger this window setting is, the higher the
		 *            possibility to find a better mixing position will become,
		 *            but at the same time large values may cause a "drifting"
		 *            artifact because consequent sequences will be taken at
		 *            more uneven intervals.
		 *
		 *            If there's a disturbing artifact that sounds as if a
		 *            constant frequency was drifting around, try reducing this
		 *            setting.
		 *
		 *            Increasing this value increases computational burden &
		 *            vice versa.
		 * @param newOverlapMs
		 *            Overlap length in milliseconds. When the chopped sound
		 *            sequences are mixed back together, to form a continuous
		 *            sound stream, this parameter defines over how long period
		 *            the two consecutive sequences are let to overlap each
		 *            other.
		 *
		 *            This shouldn't be that critical parameter. If you reduce
		 *            the sequence length by a large amount, you
		 *            might wish to try a smaller value on this.
		 *
		 *            Increasing this value increases computational burden &
		 *            vice versa.
		 */
		public Parameters(double tempo, double sampleRate, int newSequenceMs, int newSeekWindowMs, int newOverlapMs) {
			this.tempo = tempo;
			this.sampleRate = sampleRate;
			this.overlapMs = newOverlapMs;
			this.seekWindowMs = newSeekWindowMs;
			this.sequenceMs = newSequenceMs;
		}

		/**
		 * @param tempo The tempo change, 1.0 means unchanged.
		 * @param sampleRate The sample rate of the audio.
		 * @return Parameters with a 40ms sequence, a 15ms seek window and a 12ms overlap.
		 */
		public static Parameters speechDefaults(double tempo, double sampleRate){
			int sequenceMs = 40;
			int seekWindowMs = 15;
			int overlapMs = 12;
			return new Parameters(tempo,sampleRate,sequenceMs, seekWindowMs,overlapMs);
		}

		/**
		 * @param tempo The tempo change, 1.0 means unchanged.
		 * @param sampleRate The sample rate of the audio.
		 * @return Parameters with an 82ms sequence, a 28ms seek window and a 12ms overlap.
		 */
		public static Parameters musicDefaults(double tempo, double sampleRate){
			int sequenceMs = 82;
			int seekWindowMs =  28;
			int overlapMs = 12;
			return new Parameters(tempo,sampleRate,sequenceMs, seekWindowMs,overlapMs);
		}

		/**
		 * @param tempo The tempo change, 1.0 means unchanged.
		 * @param sampleRate The sample rate of the audio.
		 * @return Parameters with a 100ms sequence, a 35ms seek window and a 20ms overlap.
		 */
		public static Parameters slowdownDefaults(double tempo, double sampleRate){
			int sequenceMs = 100;
			int seekWindowMs =  35;
			int overlapMs = 20;
			return new Parameters(tempo,sampleRate,sequenceMs, seekWindowMs,overlapMs);
		}

		/**
		 * Derives the sequence and seek window lengths from the tempo by linear
		 * interpolation: the sequence goes from 125ms at tempo 0.5 to 50ms at
		 * tempo 2.0, the seek window from 25ms at tempo 0.5 to 15ms at tempo
		 * 2.0. The overlap is fixed at 12ms. Values are not clamped, so tempos
		 * outside of [0.5, 2.0] extrapolate along the same lines.
		 *
		 * @param tempo The tempo change, 1.0 means unchanged.
		 * @param sampleRate The sample rate of the audio.
		 * @return Parameters adapted to the requested tempo.
		 */
		public static Parameters automaticDefaults(double tempo, double sampleRate){
			double tempoLow = 0.5; // -50% speed
			double tempoHigh = 2.0; // +100% speed

			double sequenceMsLow = 125; //ms
			double sequenceMsHigh = 50; //ms
			double sequenceK = ((sequenceMsHigh - sequenceMsLow) / (tempoHigh - tempoLow));
			double sequenceC = sequenceMsLow - sequenceK * tempoLow;

			double seekLow = 25;// ms
			double seekHigh = 15;// ms
			double seekK =((seekHigh - seekLow) / (tempoHigh-tempoLow));
			// The intercept must be taken at tempoLow (not at seekLow) for the
			// line to pass through (tempoLow, seekLow) and (tempoHigh, seekHigh).
			double seekC = seekLow - seekK * tempoLow;

			int sequenceMs = (int) (sequenceC + sequenceK * tempo + 0.5);
			int seekWindowMs =  (int) (seekC + seekK * tempo + 0.5);
			int overlapMs = 12;
			return new Parameters(tempo,sampleRate,sequenceMs, seekWindowMs,overlapMs);
		}

		/** @return The overlap length in milliseconds. */
		public double getOverlapMs() {
			return overlapMs;
		}

		/** @return The length of a single processing sequence in milliseconds. */
		public double getSequenceMs() {
			return sequenceMs;
		}

		/** @return The seeking window length in milliseconds. */
		public double getSeekWindowMs() {
			return seekWindowMs;
		}

		/** @return The sample rate of the audio. */
		public double getSampleRate() {
			return sampleRate;
		}

		/** @return The tempo change, 1.0 means unchanged. */
		public double getTempo(){
			return tempo;
		}
	}
}