All downloads are free. The search and download functionality uses the official Maven repository.

be.tarsos.dsp.WaveformSimilarityBasedOverlapAdd Maven / Gradle / Ivy

There is a newer version: 2.4-1
Show newest version
/*
*      _______                       _____   _____ _____  
*     |__   __|                     |  __ \ / ____|  __ \ 
*        | | __ _ _ __ ___  ___  ___| |  | | (___ | |__) |
*        | |/ _` | '__/ __|/ _ \/ __| |  | |\___ \|  ___/ 
*        | | (_| | |  \__ \ (_) \__ \ |__| |____) | |     
*        |_|\__,_|_|  |___/\___/|___/_____/|_____/|_|     
*                                                         
* -------------------------------------------------------------
*
* TarsosDSP is developed by Joren Six at IPEM, University Ghent
*  
* -------------------------------------------------------------
*
*  Info: http://0110.be/tag/TarsosDSP
*  Github: https://github.com/JorenSix/TarsosDSP
*  Releases: http://0110.be/releases/TarsosDSP/
*  
*  TarsosDSP includes modified source code by various authors,
*  for credits and info, see README.
* 
*/


package be.tarsos.dsp;


/**
 *
 * 

* An overlap-add technique based on waveform similarity (WSOLA) for high * quality time-scale modification of speech *

*

* A concept of waveform similarity for tackling the problem of time-scale * modification of speech is proposed. It is worked out in the context of * short-time Fourier transform representations. The resulting WSOLA * (waveform-similarity-based synchronized overlap-add) algorithm produces * high-quality speech output, is algorithmically and computationally efficient * and robust, and allows for online processing with arbitrary time-scaling * factors that may be specified in a time-varying fashion and can be chosen * over a wide continuous range of values. *

*

* Inspired by the work soundtouch by Olli Parviainen, * http://www.surina.net/soundtouch, especially the TDStrech.cpp file. *

* @author Joren Six * @author Olli Parviainen */ public class WaveformSimilarityBasedOverlapAdd implements AudioProcessor { private int seekWindowLength; private int seekLength; private int overlapLength; private float[] pMidBuffer; private float[] pRefMidBuffer; private float[] outputFloatBuffer; private int intskip; private int sampleReq; private double tempo; private AudioDispatcher dispatcher; private Parameters newParameters; /** * Create a new instance based on algorithm parameters for a certain audio format. * @param params The parameters for the algorithm. */ public WaveformSimilarityBasedOverlapAdd(Parameters params){ setParameters(params); applyNewParameters(); } public void setParameters(Parameters params){ newParameters = params; } public void setDispatcher(AudioDispatcher newDispatcher){ this.dispatcher = newDispatcher; } private void applyNewParameters(){ Parameters params = newParameters; int oldOverlapLength = overlapLength; overlapLength = (int) ((params.getSampleRate() * params.getOverlapMs())/1000); seekWindowLength = (int) ((params.getSampleRate() * params.getSequenceMs())/1000); seekLength = (int) ((params.getSampleRate() * params.getSeekWindowMs())/1000); tempo = params.getTempo(); //pMidBuffer and pRefBuffer are initialized with 8 times the needed length to prevent a reset //of the arrays when overlapLength changes. if(overlapLength > oldOverlapLength * 8 && pMidBuffer==null){ pMidBuffer = new float[overlapLength * 8]; //overlapLengthx2? pRefMidBuffer = new float[overlapLength * 8];//overlapLengthx2? 
System.out.println("New overlapLength" + overlapLength); } double nominalSkip = tempo * (seekWindowLength - overlapLength); intskip = (int) (nominalSkip + 0.5); sampleReq = Math.max(intskip + overlapLength, seekWindowLength) + seekLength; float[] prevOutputBuffer = outputFloatBuffer; outputFloatBuffer = new float[getOutputBufferSize()]; if(prevOutputBuffer!=null){ System.out.println("Copy outputFloatBuffer contents"); for(int i = 0 ; i < prevOutputBuffer.length && i < outputFloatBuffer.length ; i++){ outputFloatBuffer[i] = prevOutputBuffer[i]; } } newParameters = null; } public int getInputBufferSize(){ return sampleReq; } private int getOutputBufferSize(){ return seekWindowLength - overlapLength; } public int getOverlap(){ return sampleReq-intskip; } /** * Overlaps the sample in output with the samples in input. * @param output The output buffer. * @param input The input buffer. */ private void overlap(final float[] output, int outputOffset, float[] input,int inputOffset){ for(int i = 0 ; i < overlapLength ; i++){ int itemp = overlapLength - i; output[i + outputOffset] = (input[i + inputOffset] * i + pMidBuffer[i] * itemp ) / overlapLength; } } /** * Seeks for the optimal overlap-mixing position. * * The best position is determined as the position where the two overlapped * sample sequences are 'most alike', in terms of the highest * cross-correlation value over the overlapping period * * @param inputBuffer The input buffer * @param postion The position where to start the seek operation, in the input buffer. * @return The best position. */ private int seekBestOverlapPosition(float[] inputBuffer, int postion) { int bestOffset; double bestCorrelation, currentCorrelation; int tempOffset; int comparePosition; // Slopes the amplitude of the 'midBuffer' samples precalcCorrReferenceMono(); bestCorrelation = -10; bestOffset = 0; // Scans for the best correlation value by testing each possible // position // over the permitted range. 
for (tempOffset = 0; tempOffset < seekLength; tempOffset++) { comparePosition = postion + tempOffset; // Calculates correlation value for the mixing position // corresponding // to 'tempOffset' currentCorrelation = (double) calcCrossCorr(pRefMidBuffer, inputBuffer,comparePosition); // heuristic rule to slightly favor values close to mid of the // range double tmp = (double) (2 * tempOffset - seekLength) / seekLength; currentCorrelation = ((currentCorrelation + 0.1) * (1.0 - 0.25 * tmp * tmp)); // Checks for the highest correlation value if (currentCorrelation > bestCorrelation) { bestCorrelation = currentCorrelation; bestOffset = tempOffset; } } return bestOffset; } /** * Slopes the amplitude of the 'midBuffer' samples so that cross correlation * is faster to calculate. Why is this faster? */ void precalcCorrReferenceMono() { for (int i = 0; i < overlapLength; i++){ float temp = i * (overlapLength - i); pRefMidBuffer[i] = pMidBuffer[i] * temp; } } double calcCrossCorr(float[] mixingPos, float[] compare, int offset){ double corr = 0; double norm = 0; for (int i = 1; i < overlapLength; i ++){ corr += mixingPos[i] * compare[i + offset]; norm += mixingPos[i] * mixingPos[i]; } // To avoid division by zero. if (norm < 1e-8){ norm = 1.0; } return corr / Math.pow(norm,0.5); } @Override public boolean process(AudioEvent audioEvent) { float[] audioFloatBuffer = audioEvent.getFloatBuffer(); assert audioFloatBuffer.length == getInputBufferSize(); //Search for the best overlapping position. int offset = seekBestOverlapPosition(audioFloatBuffer,0); // Mix the samples in the 'inputBuffer' at position of 'offset' with the // samples in 'midBuffer' using sliding overlapping // ... 
first partially overlap with the end of the previous sequence // (that's in 'midBuffer') overlap(outputFloatBuffer,0,audioFloatBuffer,offset); //copy sequence samples from input to output int sequenceLength = seekWindowLength - 2 * overlapLength; System.arraycopy(audioFloatBuffer, offset + overlapLength, outputFloatBuffer, overlapLength, sequenceLength); // Copies the end of the current sequence from 'inputBuffer' to // 'midBuffer' for being mixed with the beginning of the next // processing sequence and so on System.arraycopy(audioFloatBuffer, offset + sequenceLength + overlapLength, pMidBuffer, 0, overlapLength); assert outputFloatBuffer.length == getOutputBufferSize(); audioEvent.setFloatBuffer(outputFloatBuffer); audioEvent.setOverlap(0); if(newParameters!=null){ applyNewParameters(); dispatcher.setStepSizeAndOverlap(getInputBufferSize(),getOverlap()); } return true; } @Override public void processingFinished() { // NOOP } /** * An object to encapsulate some of the parameters for * WSOLA, together with a couple of practical helper functions. * * @author Joren Six */ public static class Parameters { private final int sequenceMs; private final int seekWindowMs; private final int overlapMs; private final double tempo; private final double sampleRate; /** * @param tempo * The tempo change 1.0 means unchanged, 2.0 is + 100% , 0.5 * is half of the speed. * @param sampleRate * The sample rate of the audio 44.1kHz is common. * @param newSequenceMs * Length of a single processing sequence, in milliseconds. * This determines to how long sequences the original sound * is chopped in the time-stretch algorithm. * * The larger this value is, the lesser sequences are used in * processing. In principle a bigger value sounds better when * slowing down tempo, but worse when increasing tempo and * vice versa. * * Increasing this value reduces computational burden & vice * versa. 
* @param newSeekWindowMs * Seeking window length in milliseconds for algorithm that * finds the best possible overlapping location. This * determines from how wide window the algorithm may look for * an optimal joining location when mixing the sound * sequences back together. * * The bigger this window setting is, the higher the * possibility to find a better mixing position will become, * but at the same time large values may cause a "drifting" * artifact because consequent sequences will be taken at * more uneven intervals. * * If there's a disturbing artifact that sounds as if a * constant frequency was drifting around, try reducing this * setting. * * Increasing this value increases computational burden & * vice versa. * @param newOverlapMs * Overlap length in milliseconds. When the chopped sound * sequences are mixed back together, to form a continuous * sound stream, this parameter defines over how long period * the two consecutive sequences are let to overlap each * other. * * This shouldn't be that critical parameter. If you reduce * the DEFAULT_SEQUENCE_MS setting by a large amount, you * might wish to try a smaller value on this. * * Increasing this value increases computational burden & * vice versa. 
*/ public Parameters(double tempo, double sampleRate, int newSequenceMs, int newSeekWindowMs, int newOverlapMs) { this.tempo = tempo; this.sampleRate = sampleRate; this.overlapMs = newOverlapMs; this.seekWindowMs = newSeekWindowMs; this.sequenceMs = newSequenceMs; } public static Parameters speechDefaults(double tempo, double sampleRate){ int sequenceMs = 40; int seekWindowMs = 15; int overlapMs = 12; return new Parameters(tempo,sampleRate,sequenceMs, seekWindowMs,overlapMs); } public static Parameters musicDefaults(double tempo, double sampleRate){ int sequenceMs = 82; int seekWindowMs = 28; int overlapMs = 12; return new Parameters(tempo,sampleRate,sequenceMs, seekWindowMs,overlapMs); } public static Parameters slowdownDefaults(double tempo, double sampleRate){ int sequenceMs = 100; int seekWindowMs = 35; int overlapMs = 20; return new Parameters(tempo,sampleRate,sequenceMs, seekWindowMs,overlapMs); } public static Parameters automaticDefaults(double tempo, double sampleRate){ double tempoLow = 0.5; // -50% speed double tempoHigh = 2.0; // +100% speed double sequenceMsLow = 125; //ms double sequenceMsHigh = 50; //ms double sequenceK = ((sequenceMsHigh - sequenceMsLow) / (tempoHigh - tempoLow)); double sequenceC = sequenceMsLow - sequenceK * tempoLow; double seekLow = 25;// ms double seekHigh = 15;// ms double seekK =((seekHigh - seekLow) / (tempoHigh-tempoLow)); double seekC = seekLow - seekK * seekLow; int sequenceMs = (int) (sequenceC + sequenceK * tempo + 0.5); int seekWindowMs = (int) (seekC + seekK * tempo + 0.5); int overlapMs = 12; return new Parameters(tempo,sampleRate,sequenceMs, seekWindowMs,overlapMs); } public double getOverlapMs() { return overlapMs; } public double getSequenceMs() { return sequenceMs; } public double getSeekWindowMs() { return seekWindowMs; } public double getSampleRate() { return sampleRate; } public double getTempo(){ return tempo; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy