core.be.tarsos.dsp.pitch.McLeodPitchMethod Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of TarsosDSP Show documentation
TarsosDSP library fork
The newest version!
/*
*      _______                       _____   _____ _____  
*     |__   __|                     |  __ \ / ____|  __ \ 
*        | | __ _ _ __ ___  ___  ___| |  | | (___ | |__) |
*        | |/ _` | '__/ __|/ _ \/ __| |  | |\___ \|  ___/ 
*        | | (_| | |  \__ \ (_) \__ \ |__| |____) | |     
*        |_|\__,_|_|  |___/\___/|___/_____/|_____/|_|     
*                                                         
* -------------------------------------------------------------
*
* TarsosDSP is developed by Joren Six at IPEM, University Ghent
*  
* -------------------------------------------------------------
*
*  Info: http://0110.be/tag/TarsosDSP
*  Github: https://github.com/JorenSix/TarsosDSP
*  Releases: http://0110.be/releases/TarsosDSP/
*  
*  TarsosDSP includes modified source code by various authors,
*  for credits and info, see README.
* 
*/


/**
 */
package be.tarsos.dsp.pitch;

import java.util.ArrayList;
import java.util.List;

/**
 * 
 * Implementation of The McLeod Pitch Method (MPM). It is described in the
 * article A Smarter Way to Find Pitch. According to the article:
 * 
 *  
 * 
 * A fast, accurate and robust method for finding the continuous pitch in
 * monophonic musical sounds. [It uses] a special normalized version of the
 * Squared Difference Function (SDF) coupled with a peak picking algorithm.
 * 
 * 
 * MPM runs in real time with a standard 44.1 kHz sampling rate. It operates
 * without using low-pass filtering so it can work on sound with high harmonic
 * frequencies such as a violin and it can display pitch changes of one cent
 * reliably. MPM works well without any post processing to correct the pitch.
 * 
 *  
 * 
 * For the moment this implementation uses the inefficient way of calculating
 * the pitch. It uses O(Ww) with W the window size in samples and w
 * the desired number of ACF coefficients. The implementation can be optimized
 * to O((W+w)log(W+w)) by using an FFT to calculate the ACF. But I am still afraid of the
 * dark magic of the FFT and clinging to the familiar, friendly, laggard time
 * domain.
 * 
 * 
 * @author Phillip McLeod
 * @author Joren Six
 */
public final class McLeodPitchMethod implements PitchDetector {

	/**
	 * The expected size of an audio buffer (in samples).
	 */
	public static final int DEFAULT_BUFFER_SIZE = 1024;

	/**
	 * Overlap defines how much two audio buffers following each other should
	 * overlap (in samples). 75% overlap is advised in the MPM article.
	 */
	public static final int DEFAULT_OVERLAP = 768;

	/**
	 * Defines the relative size the chosen peak (pitch) has. 0.93 means: choose
	 * the first peak that is higher than 93% of the highest peak detected. 93%
	 * is the default value used in the Tartini user interface.
	 */
	private static final double DEFAULT_CUTOFF = 0.97;
	/**
	 * For performance reasons, peaks below this cutoff are not even considered.
	 */
	private static final double SMALL_CUTOFF = 0.5;

	/**
	 * Pitch annotations below this threshold are considered invalid, they are
	 * ignored.
	 */
	private static final double LOWER_PITCH_CUTOFF = 80.0; // Hz

	/**
	 * Defines the relative size the chosen peak (pitch) has.
	 */
	private final double cutoff;

	/**
	 * The audio sample rate. Most audio has a sample rate of 44.1kHz.
	 */
	private final float sampleRate;

	/**
	 * Contains a normalized square difference function value for each delay
	 * (tau).
	 */
	private final float[] nsdf;

	/**
	 * The x and y coordinate of the top of the curve (nsdf).
	 */
	private float turningPointX, turningPointY;

	/**
	 * A list with minimum and maximum values of the nsdf curve.
	 */
	private final List maxPositions = new ArrayList();

	/**
	 * A list of estimates of the period of the signal (in samples).
	 */
	private final List periodEstimates = new ArrayList();

	/**
	 * A list of estimates of the amplitudes corresponding with the period
	 * estimates.
	 */
	private final List ampEstimates = new ArrayList();

	/**
	 * The result of the pitch detection iteration.
	 */
	private final PitchDetectionResult result;

	/**
	 * Initializes the normalized square difference value array and stores the
	 * sample rate.
	 * 
	 * @param audioSampleRate
	 *            The sample rate of the audio to check.
	 */
	public McLeodPitchMethod(final float audioSampleRate) {
		this(audioSampleRate, DEFAULT_BUFFER_SIZE, DEFAULT_CUTOFF);
	}

	/**
	 * Create a new pitch detector.
	 * 
	 * @param audioSampleRate
	 *            The sample rate of the audio.
	 * @param audioBufferSize
	 *            The size of one audio buffer 1024 samples is common.
	 */
	public McLeodPitchMethod(final float audioSampleRate, final int audioBufferSize) {
		this(audioSampleRate, audioBufferSize, DEFAULT_CUTOFF);
	}

	/**
	 * Create a new pitch detector.
	 * 
	 * @param audioSampleRate
	 *            The sample rate of the audio.
	 * @param audioBufferSize
	 *            The size of one audio buffer 1024 samples is common.
	 * @param cutoffMPM
	 *            The cutoff (similar to the YIN threshold). In the Tartini
	 *            paper 0.93 is used.
	 */
	public McLeodPitchMethod(final float audioSampleRate, final int audioBufferSize, final double cutoffMPM) {
		this.sampleRate = audioSampleRate;
		nsdf = new float[audioBufferSize];
		this.cutoff = cutoffMPM;
		result = new PitchDetectionResult();
	}

	/**
	 * Implements the normalized square difference function. See section 4 (and
	 * the explanation before) in the MPM article. This calculation can be
	 * optimized by using an FFT. The results should remain the same.
	 * 
	 * @param audioBuffer
	 *            The buffer with audio information.
	 */
	private void normalizedSquareDifference(final float[] audioBuffer) {
		for (int tau = 0; tau < audioBuffer.length; tau++) {
			float acf = 0;
			float divisorM = 0;
			for (int i = 0; i < audioBuffer.length - tau; i++) {
				acf += audioBuffer[i] * audioBuffer[i + tau];
				divisorM += audioBuffer[i] * audioBuffer[i] + audioBuffer[i + tau] * audioBuffer[i + tau];
			}
			nsdf[tau] = 2 * acf / divisorM;
		}
	}

	/*
	 * (non-Javadoc)
	 * 
	 * @see be.tarsos.pitch.pure.PurePitchDetector#getPitch(float[])
	 */
	public PitchDetectionResult getPitch(final float[] audioBuffer) {
		final float pitch;

		// 0. Clear previous results (Is this faster than initializing a list
		// again and again?)
		maxPositions.clear();
		periodEstimates.clear();
		ampEstimates.clear();

		// 1. Calculate the normalized square difference for each Tau value.
		normalizedSquareDifference(audioBuffer);
		// 2. Peak picking time: time to pick some peaks.
		peakPicking();

		double highestAmplitude = Double.NEGATIVE_INFINITY;

		for (final Integer tau : maxPositions) {
			// make sure every annotation has a probability attached
			highestAmplitude = Math.max(highestAmplitude, nsdf[tau]);

			if (nsdf[tau] > SMALL_CUTOFF) {
				// calculates turningPointX and Y
				parabolicInterpolation(tau);
				// store the turning points
				ampEstimates.add(turningPointY);
				periodEstimates.add(turningPointX);
				// remember the highest amplitude
				highestAmplitude = Math.max(highestAmplitude, turningPointY);
			}
		}

		if (periodEstimates.isEmpty()) {
			pitch = -1;
		} else {
			// use the overall maximum to calculate a cutoff.
			// The cutoff value is based on the highest value and a relative
			// threshold.
			final double actualCutoff = cutoff * highestAmplitude;

			// find first period above or equal to cutoff
			int periodIndex = 0;
			for (int i = 0; i < ampEstimates.size(); i++) {
				if (ampEstimates.get(i) >= actualCutoff) {
					periodIndex = i;
					break;
				}
			}

			final double period = periodEstimates.get(periodIndex);
			final float pitchEstimate = (float) (sampleRate / period);
			if (pitchEstimate > LOWER_PITCH_CUTOFF) {
				pitch = pitchEstimate;
			} else {
				pitch = -1;
			}

		}
		result.setProbability((float) highestAmplitude);
		result.setPitch(pitch);
		result.setPitched(pitch != -1);
		
		return result;
	}

	/**
	 * 
	 * Finds the x value corresponding with the peak of a parabola.
	 * 
	 * 
	 * a,b,c are three samples that follow each other. E.g. a is at 511, b at
	 * 512 and c at 513; f(a), f(b) and f(c) are the normalized square
	 * difference values for those samples; x is the peak of the parabola and is
	 * what we are looking for. Because the samples follow each other
	 * b - a = 1 the formula for parabolic interpolation
	 * can be simplified a lot.
	 * 
	 * 
	 * The following ASCII ART shows it a bit more clear, imagine this to be a
	 * bit more curvaceous.
	 * 
	 * 
	 * 	 *     nsdf(x)
	 *       ^
	 *       |
	 * f(x)  |------ ^
	 * f(b)  |     / |\
	 * f(a)  |    /  | \
	 *       |   /   |  \
	 *       |  /    |   \
	 * f(c)  | /     |    \
	 *       |_____________________> x
	 *            a  x b  c
	 * 
	 * 
	 * @param tau
	 *            The delay tau, b value in the drawing is the tau value.
	 */
	private void parabolicInterpolation(final int tau) {
		final float nsdfa = nsdf[tau - 1];
		final float nsdfb = nsdf[tau];
		final float nsdfc = nsdf[tau + 1];
		final float bValue = tau;
		final float bottom = nsdfc + nsdfa - 2 * nsdfb;
		if (bottom == 0.0) {
			turningPointX = bValue;
			turningPointY = nsdfb;
		} else {
			final float delta = nsdfa - nsdfc;
			turningPointX = bValue + delta / (2 * bottom);
			turningPointY = nsdfb - delta * delta / (8 * bottom);
		}
	}

	/**
	 * 
	 * Implementation based on the GPL'ED code of Tartini This code can be found in the file
	 * general/mytransforms.cpp.
	 * 
	 * 
	 * Finds the highest value between each pair of positive zero crossings.
	 * Including the highest value between the last positive zero crossing and
	 * the end (if any). Ignoring the first maximum (which is at zero). In this
	 * diagram the desired values are marked with a +
	 * 
	 * 
	 * 	 *  f(x)
	 *   ^
	 *   |
	 *  1|               +
	 *   | \      +     /\      +     /\
	 *  0| _\____/\____/__\/\__/\____/_______> x
	 *   |   \  /  \  /      \/  \  /
	 * -1|    \/    \/            \/
	 *   |
	 * 
	 * 
	 * @param nsdf
	 *            The array to look for maximum values in. It should contain
	 *            values between -1 and 1
	 * @author Phillip McLeod
	 */
	private void peakPicking() {

		int pos = 0;
		int curMaxPos = 0;

		// find the first negative zero crossing
		while (pos < (nsdf.length - 1) / 3 && nsdf[pos] > 0) {
			pos++;
		}

		// loop over all the values below zero
		while (pos < nsdf.length - 1 && nsdf[pos] <= 0.0) {
			pos++;
		}

		// can happen if output[0] is NAN
		if (pos == 0) {
			pos = 1;
		}

		while (pos < nsdf.length - 1) {
			assert nsdf[pos] >= 0;
			if (nsdf[pos] > nsdf[pos - 1] && nsdf[pos] >= nsdf[pos + 1]) {
				if (curMaxPos == 0) {
					// the first max (between zero crossings)
					curMaxPos = pos;
				} else if (nsdf[pos] > nsdf[curMaxPos]) {
					// a higher max (between the zero crossings)
					curMaxPos = pos;
				}
			}
			pos++;
			// a negative zero crossing
			if (pos < nsdf.length - 1 && nsdf[pos] <= 0) {
				// if there was a maximum add it to the list of maxima
				if (curMaxPos > 0) {
					maxPositions.add(curMaxPos);
					curMaxPos = 0; // clear the maximum position, so we start
					// looking for a new ones
				}
				while (pos < nsdf.length - 1 && nsdf[pos] <= 0.0f) {
					pos++; // loop over all the values below zero
				}
			}
		}
		if (curMaxPos > 0) { // if there was a maximum in the last part
			maxPositions.add(curMaxPos); // add it to the vector of maxima
		}
	}
}