/*
 * Copyright (c) 2021, Peter Abeles. All Rights Reserved.
 *
 * This file is part of BoofCV (http://boofcv.org).
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package boofcv.alg.tracker.circulant;

import boofcv.abst.feature.detect.peak.SearchLocalPeak;
import boofcv.abst.transform.fft.DiscreteFourierTransform;
import boofcv.alg.interpolate.InterpolatePixelS;
import boofcv.alg.misc.PixelMath;
import boofcv.alg.transform.fft.DiscreteFourierTransformOps;
import boofcv.factory.feature.detect.peak.ConfigMeanShiftSearch;
import boofcv.factory.feature.detect.peak.FactorySearchLocalPeak;
import boofcv.misc.BoofMiscOps;
import boofcv.struct.image.GrayF64;
import boofcv.struct.image.ImageGray;
import boofcv.struct.image.InterleavedF64;
import georegression.struct.shapes.RectangleLength2D_F32;

import java.util.Random;

/**
 * <p>
 * Tracker that uses the theory of circulant matrices, the Discrete Fourier Transform (DFT), and linear classifiers
 * to track a target and learn its changes in appearance [1]. The target is assumed to be rectangular with a fixed
 * size and location. A dense local search is performed around the most recent target location. The search is done
 * quickly using the DFT.
 * </p>
 *
 * <p>
 * Tracking is performed using texture information. Since only one description of the target is saved, tracks can
 * drift over time. Tracking performance seems to improve if the object has distinctive edges.
 * </p>
 *
 * <p>CHANGES FROM PAPER:</p>
 * <ul>
 * <li>The input image is sampled into a square work region of constant size to improve the runtime speed of the FFT.</li>
 * <li>The peak of the response is found using mean-shift, which provides sub-pixel precision.</li>
 * <li>Pixels outside the image are assigned random values to keep the tracker from fitting to them. Ideally they
 * wouldn't be processed at all, but that is complex to implement.</li>
 * </ul>
 *
 * <p>
 * [1] Henriques, Joao F., et al. "Exploiting the circulant structure of tracking-by-detection with kernels."
 * Computer Vision - ECCV 2012. Springer Berlin Heidelberg, 2012. 702-715.
 * </p>
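 *
 * <p>
 * A minimal usage sketch is shown below. It is illustrative only: the factory call used to create the
 * interpolator and the constants passed to the constructor are plausible values taken from the constructor's
 * Javadoc, not settings tuned for any particular sequence, and {@code frame0}/{@code frame1} stand in for
 * consecutive video frames.
 * </p>
 * <pre>{@code
 * InterpolatePixelS<GrayF64> interp =
 *     FactoryInterpolation.bilinearPixelS(GrayF64.class, BorderType.EXTENDED);
 * CirculantTracker<GrayF64> tracker = new CirculantTracker<>(
 *     1.0/16.0, 0.2, 1e-2, 0.075, 1.0, 64, 255, interp);
 * tracker.initialize(frame0, x0, y0, width, height); // select the target in the first frame
 * tracker.performTracking(frame1);                   // call once for each new frame
 * RectangleLength2D_F32 found = tracker.getTargetLocation();
 * }</pre>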
 *
 * @author Peter Abeles
 */
public class CirculantTracker<T extends ImageGray<T>> {

    // --- Tuning parameters

    // spatial bandwidth (proportional to target)
    private final double output_sigma_factor;

    // gaussian kernel bandwidth
    private final double sigma;

    // regularization term
    private final double lambda;

    // linear interpolation term. Adjusts how fast it can learn
    private final double interp_factor;

    // the maximum pixel value
    private final double maxPixelValue;

    // extra padding around the selected region
    private final double padding;

    //----- Internal variables

    // Input image width and height
    private int imageWidth, imageHeight;

    // computes the FFT
    private final DiscreteFourierTransform<GrayF64, InterleavedF64> fft =
            DiscreteFourierTransformOps.createTransformF64();

    // storage for subimage of input image
    protected GrayF64 templateNew = new GrayF64(1, 1);
    // storage for the subimage of the previous frame
    protected GrayF64 template = new GrayF64(1, 1);

    // cosine window used to reduce artifacts from FFT
    protected GrayF64 cosine = new GrayF64(1, 1);

    // Storage for the kernel's response
    private final GrayF64 k = new GrayF64(1, 1);
    private final InterleavedF64 kf = new InterleavedF64(1, 1, 2);

    // Learned values. Used to compute weights in the linear classifier
    private final InterleavedF64 alphaf = new InterleavedF64(1, 1, 2);
    private final InterleavedF64 newAlphaf = new InterleavedF64(1, 1, 2);

    // location of target
    protected RectangleLength2D_F32 regionTrack = new RectangleLength2D_F32();
    protected RectangleLength2D_F32 regionOut = new RectangleLength2D_F32();

    // Used for computing the gaussian kernel
    protected GrayF64 gaussianWeight = new GrayF64(1, 1);
    protected InterleavedF64 gaussianWeightDFT = new InterleavedF64(1, 1, 2);

    // detector response
    private final GrayF64 response = new GrayF64(1, 1);

    // storage for temporary results
    private final GrayF64 tmpReal0 = new GrayF64(1, 1);
    private final GrayF64 tmpReal1 = new GrayF64(1, 1);

    private final InterleavedF64 tmpFourier0 = new InterleavedF64(1, 1, 2);
    private final InterleavedF64 tmpFourier1 = new InterleavedF64(1, 1, 2);
    private final InterleavedF64 tmpFourier2 = new InterleavedF64(1, 1, 2);

    // interpolation used when sampling input image into work space
    private final InterpolatePixelS<T> interp;

    // used to compute sub-pixel location
    private final SearchLocalPeak<GrayF64> localPeak =
            FactorySearchLocalPeak.meanShiftUniform(new ConfigMeanShiftSearch(5, 1e-4), GrayF64.class);

    // adjustment from sub-pixel
    protected float offX, offY;

    // size of the work space in pixels
    private final int workRegionSize;
    // conversion from workspace to image pixels
    private float stepX, stepY;

    // used to fill the area outside of the image with unstructured data.
    private final Random rand = new Random(234);

    /**
     * Configure tracker
     *
     * @param output_sigma_factor spatial bandwidth (proportional to target) Try 1.0/16.0
     * @param sigma Sigma for Gaussian kernel in linear classifier. Try 0.2
     * @param lambda Try 1e-2
     * @param interp_factor Try 0.075
     * @param padding Padding added around the selected target. Try 1
     * @param workRegionSize Size of work region. Best if power of 2. Try 64
     * @param maxPixelValue Maximum pixel value. Typically 255
     */
    public CirculantTracker( double output_sigma_factor, double sigma, double lambda, double interp_factor,
                             double padding, int workRegionSize, double maxPixelValue,
                             InterpolatePixelS<T> interp ) {
        if (workRegionSize < 3)
            throw new IllegalArgumentException("Minimum size of work region is 3 pixels.");

        this.output_sigma_factor = output_sigma_factor;
        this.sigma = sigma;
        this.lambda = lambda;
        this.interp_factor = interp_factor;
        this.maxPixelValue = maxPixelValue;
        this.interp = interp;

        this.padding = padding;
        this.workRegionSize = workRegionSize;

        resizeImages(workRegionSize);
        computeCosineWindow(cosine);
        computeGaussianWeights(workRegionSize);

        localPeak.setImage(response);
    }

    /**
     * Initializes tracking around the specified rectangle region
     *
     * @param image Image to start tracking from
     * @param x0 top-left corner of region
     * @param y0 top-left corner of region
     * @param regionWidth region's width
     * @param regionHeight region's height
     */
    public void initialize( T image, int x0, int y0, int regionWidth, int regionHeight ) {
        this.imageWidth = image.width;
        this.imageHeight = image.height;

        setTrackLocation(x0, y0, regionWidth, regionHeight);

        initialLearning(image);
    }

    /**
     * Used to change the track's location. If this method is used it is assumed that tracking is active and that
     * the appearance of the target has not changed
     *
     * @param x0 top-left corner of region
     * @param y0 top-left corner of region
     * @param regionWidth region's width
     * @param regionHeight region's height
     */
    public void setTrackLocation( int x0, int y0, int regionWidth, int regionHeight ) {
        if (imageWidth < regionWidth || imageHeight < regionHeight)
            throw new IllegalArgumentException("Track region is larger than input image: "
                    + regionWidth + " " + regionHeight);

        regionOut.width = regionWidth;
        regionOut.height = regionHeight;

        // adjust for padding
        int w = (int)(regionWidth*(1 + padding));
        int h = (int)(regionHeight*(1 + padding));
        int cx = x0 + regionWidth/2;
        int cy = y0 + regionHeight/2;

        // save the track location
        this.regionTrack.width = w;
        this.regionTrack.height = h;
        this.regionTrack.x0 = cx - w/2;
        this.regionTrack.y0 = cy - h/2;

        stepX = (w - 1)/(float)(workRegionSize - 1);
        stepY = (h - 1)/(float)(workRegionSize - 1);

        updateRegionOut();
    }
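    // Worked example of the region math above (illustrative numbers, not from the source):
    // a 40x30 selection with padding = 1 becomes an 80x60 track region centered on the
    // selection; with workRegionSize = 64 it is resampled onto a 64x64 grid using
    // stepX = 79/63 ~ 1.254 and stepY = 59/63 ~ 0.937 image pixels per work-region pixel.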
    /**
     * Learn the target's appearance.
     */
    protected void initialLearning( T image ) {
        // get subwindow at current estimated target position, to train classifier
        get_subwindow(image, template);

        // Kernel Regularized Least-Squares, calculate alphas (in Fourier domain)
        // k = dense_gauss_kernel(sigma, x);
        dense_gauss_kernel(sigma, template, template, k);
        fft.forward(k, kf);

        // new_alphaf = yf ./ (fft2(k) + lambda);   %(Eq. 7)
        computeAlphas(gaussianWeightDFT, kf, lambda, alphaf);
    }

    /**
     * Computes the cosine window
     */
    protected static void computeCosineWindow( GrayF64 cosine ) {
        double[] cosX = new double[cosine.width];
        for (int x = 0; x < cosine.width; x++) {
            cosX[x] = 0.5*(1 - Math.cos(2.0*Math.PI*x/(cosine.width - 1)));
        }
        for (int y = 0; y < cosine.height; y++) {
            int index = cosine.startIndex + y*cosine.stride;
            double cosY = 0.5*(1 - Math.cos(2.0*Math.PI*y/(cosine.height - 1)));
            for (int x = 0; x < cosine.width; x++) {
                cosine.data[index++] = cosX[x]*cosY;
            }
        }
    }
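    // Note on the window above: it is the outer product of two 1-D Hann windows,
    //   w(x,y) = 0.25*(1 - cos(2*pi*x/(width-1)))*(1 - cos(2*pi*y/(height-1))),
    // which tapers the template towards zero at its borders so the periodic extension
    // implied by the FFT does not introduce strong artificial edges.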
    /**
     * Computes the weights used in the gaussian kernel.
     *
     * This isn't actually symmetric for even widths. These weights are used as labels in the learning phase.
     * The closer a weight is to one, the more likely that location is the true target. There should be a peak
     * in the image center. If there is not, it will learn an incorrect model.
     */
    protected void computeGaussianWeights( int width ) {
        // desired output (gaussian shaped), bandwidth proportional to target size
        double output_sigma = Math.sqrt(width*width)*output_sigma_factor;

        double left = -0.5/(output_sigma*output_sigma);

        int radius = width/2;

        for (int y = 0; y < gaussianWeight.height; y++) {
            int index = gaussianWeight.startIndex + y*gaussianWeight.stride;
            double ry = y - radius;

            for (int x = 0; x < width; x++) {
                double rx = x - radius;
                gaussianWeight.data[index++] = Math.exp(left*(ry*ry + rx*rx));
            }
        }

        fft.forward(gaussianWeight, gaussianWeightDFT);
    }
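    // The labels above follow y(px,py) = exp(-((px - r)^2 + (py - r)^2)/(2*s^2)) with
    // r = width/2 and s = output_sigma_factor*width. Math.sqrt(width*width) reduces to
    // just width here; the form appears to mirror sqrt(prod(target_sz)) in the authors'
    // MATLAB code, where the target region need not be square.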
    protected void resizeImages( int workRegionSize ) {
        templateNew.reshape(workRegionSize, workRegionSize);
        template.reshape(workRegionSize, workRegionSize);
        cosine.reshape(workRegionSize, workRegionSize);
        k.reshape(workRegionSize, workRegionSize);
        kf.reshape(workRegionSize, workRegionSize);
        alphaf.reshape(workRegionSize, workRegionSize);
        newAlphaf.reshape(workRegionSize, workRegionSize);
        response.reshape(workRegionSize, workRegionSize);
        tmpReal0.reshape(workRegionSize, workRegionSize);
        tmpReal1.reshape(workRegionSize, workRegionSize);
        tmpFourier0.reshape(workRegionSize, workRegionSize);
        tmpFourier1.reshape(workRegionSize, workRegionSize);
        tmpFourier2.reshape(workRegionSize, workRegionSize);
        gaussianWeight.reshape(workRegionSize, workRegionSize);
        gaussianWeightDFT.reshape(workRegionSize, workRegionSize);
    }

    /**
     * Search for the track in the image and update its location.
     *
     * @param image Next image in the sequence
     */
    public void performTracking( T image ) {
        if (image.width != imageWidth || image.height != imageHeight)
            throw new IllegalArgumentException("Tracking image size is not the same as " +
                    "input image. Expected " + imageWidth + " x " + imageHeight);
        updateTrackLocation(image);
        if (interp_factor != 0)
            performLearning(image);
    }

    /**
     * Find the target inside the current image by searching around its last known location
     */
    protected void updateTrackLocation( T image ) {
        get_subwindow(image, templateNew);

        // calculate response of the classifier at all locations
        // matlab: k = dense_gauss_kernel(sigma, x, z);
        dense_gauss_kernel(sigma, templateNew, template, k);
        fft.forward(k, kf);

        // response = real(ifft2(alphaf .* fft2(k)));   %(Eq. 9)
        DiscreteFourierTransformOps.multiplyComplex(alphaf, kf, tmpFourier0);
        fft.inverse(tmpFourier0, response);

        // find the pixel with the largest response
        int N = response.width*response.height;
        int indexBest = -1;
        double valueBest = -1;
        for (int i = 0; i < N; i++) {
            double v = response.data[i];
            if (v > valueBest) {
                valueBest = v;
                indexBest = i;
            }
        }

        int peakX = indexBest%response.width;
        int peakY = indexBest/response.width;

        // sub-pixel peak estimation
        subpixelPeak(peakX, peakY);

        // peak in region's coordinate system
        float deltaX = (peakX + offX) - templateNew.width/2;
        float deltaY = (peakY + offY) - templateNew.height/2;

        // convert peak location into image coordinate system
        regionTrack.x0 = regionTrack.x0 + deltaX*stepX;
        regionTrack.y0 = regionTrack.y0 + deltaY*stepY;

        updateRegionOut();
    }

    /**
     * Refine the local-peak using a search algorithm for sub-pixel accuracy.
     */
    protected void subpixelPeak( int peakX, int peakY ) {
        // this function for r was determined empirically by using work regions of 32,64,128
        int r = Math.min(2, response.width/25);
        if (r < 0)
            return;

        localPeak.setSearchRadius(r);
        localPeak.search(peakX, peakY);

        offX = localPeak.getPeakX() - peakX;
        offY = localPeak.getPeakY() - peakY;
    }

    private void updateRegionOut() {
        regionOut.x0 = (regionTrack.x0 + ((int)regionTrack.width)/2) - ((int)regionOut.width)/2;
        regionOut.y0 = (regionTrack.y0 + ((int)regionTrack.height)/2) - ((int)regionOut.height)/2;
    }

    /**
     * Update the alphas and the track's appearance
     */
    public void performLearning( T image ) {
        // use the updated track location
        get_subwindow(image, templateNew);

        // Kernel Regularized Least-Squares, calculate alphas (in Fourier domain)
        // k = dense_gauss_kernel(sigma, x);
        dense_gauss_kernel(sigma, templateNew, templateNew, k);
        fft.forward(k, kf);

        // new_alphaf = yf ./ (fft2(k) + lambda);   %(Eq. 7)
        computeAlphas(gaussianWeightDFT, kf, lambda, newAlphaf);

        // subsequent frames, interpolate model
        // alphaf = (1 - interp_factor) * alphaf + interp_factor * new_alphaf;
        int N = alphaf.width*alphaf.height*2;
        for (int i = 0; i < N; i++) {
            alphaf.data[i] = (1 - interp_factor)*alphaf.data[i] + interp_factor*newAlphaf.data[i];
        }

        // Set the previous image to be an interpolated version
        // z = (1 - interp_factor) * z + interp_factor * new_z;
        N = templateNew.width*templateNew.height;
        for (int i = 0; i < N; i++) {
            template.data[i] = (1 - interp_factor)*template.data[i] + interp_factor*templateNew.data[i];
        }
    }
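    // Identity exploited by dense_gauss_kernel below: for a Gaussian kernel,
    //   k(x,z) = exp(-||x - z||^2/sigma^2)  with  ||x - z||^2 = ||x||^2 + ||z||^2 - 2<x,z>.
    // The inner products between x and every cyclic shift of z form the circular
    // cross-correlation of the two images, which is computed for all shifts at once as
    // invF[ F(x) .* conj(F(z)) ], turning the dense local search into a few FFTs.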
    /**
     * Gaussian Kernel with dense sampling.
     * Evaluates a gaussian kernel with bandwidth SIGMA for all displacements between input images X and Y,
     * which must both be MxN. They must also be periodic (i.e., pre-processed with a cosine window).
     * The result is an MxN map of responses.
     *
     * @param sigma Gaussian kernel bandwidth
     * @param x Input image
     * @param y Input image
     * @param k Output containing Gaussian kernel for each element in target region
     */
    public void dense_gauss_kernel( double sigma, GrayF64 x, GrayF64 y, GrayF64 k ) {
        InterleavedF64 xf = tmpFourier0, yf, xyf = tmpFourier2;
        GrayF64 xy = tmpReal0;
        double yy;

        // find x in Fourier domain
        fft.forward(x, xf);
        double xx = imageDotProduct(x);

        if (x != y) {
            // general case, x and y are different
            yf = tmpFourier1;
            fft.forward(y, yf);
            yy = imageDotProduct(y);
        } else {
            // auto-correlation of x, avoid repeating a few operations
            yf = xf;
            yy = xx;
        }

        //---- xy = invF[ F(x)*F(y) ]
        // cross-correlation term in Fourier domain
        elementMultConjB(xf, yf, xyf);
        // convert to spatial domain
        fft.inverse(xyf, xy);

        circshift(xy, tmpReal1);

        // calculate gaussian response for all positions
        gaussianKernel(xx, yy, tmpReal1, sigma, k);
    }

    public static void circshift( GrayF64 a, GrayF64 b ) {
        int w2 = a.width/2;
        int h2 = b.height/2;

        for (int y = 0; y < a.height; y++) {
            int yy = (y + h2)%a.height;

            for (int x = 0; x < a.width; x++) {
                int xx = (x + w2)%a.width;

                b.set(xx, yy, a.get(x, y));
            }
        }
    }

    /**
     * Computes the dot product of the image with itself
     */
    public static double imageDotProduct( GrayF64 a ) {
        double total = 0;

        int N = a.width*a.height;
        for (int index = 0; index < N; index++) {
            double value = a.data[index];
            total += value*value;
        }

        return total;
    }

    /**
     * Element-wise multiplication of 'a' and the complex conjugate of 'b'
     */
    public static void elementMultConjB( InterleavedF64 a, InterleavedF64 b, InterleavedF64 output ) {
        for (int y = 0; y < a.height; y++) {
            int index = a.startIndex + y*a.stride;

            for (int x = 0; x < a.width; x++, index += 2) {
                double realA = a.data[index];
                double imgA = a.data[index + 1];
                double realB = b.data[index];
                double imgB = b.data[index + 1];

                output.data[index] = realA*realB + imgA*imgB;
                output.data[index + 1] = -realA*imgB + imgA*realB;
            }
        }
    }

    /**
     * new_alphaf = yf ./ (fft2(k) + lambda);   %(Eq. 7)
     */
    protected static void computeAlphas( InterleavedF64 yf, InterleavedF64 kf, double lambda,
                                         InterleavedF64 alphaf ) {
        for (int y = 0; y < kf.height; y++) {
            int index = yf.startIndex + y*yf.stride;

            for (int x = 0; x < kf.width; x++, index += 2) {
                double a = yf.data[index];
                double b = yf.data[index + 1];
                double c = kf.data[index] + lambda;
                double d = kf.data[index + 1];

                double bottom = c*c + d*d;

                alphaf.data[index] = (a*c + b*d)/bottom;
                alphaf.data[index + 1] = (b*c - a*d)/bottom;
            }
        }
    }
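    // Complex arithmetic used above, written out: elementMultConjB computes
    //   (a + b*i)*(c - d*i) = (a*c + b*d) + (b*c - a*d)*i,
    // and computeAlphas divides per frequency, with lambda added to the real part of
    // the denominator as ridge regularization so the division stays well conditioned:
    //   (a + b*i)/(c + d*i) = ((a*c + b*d) + (b*c - a*d)*i)/(c*c + d*d).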
    /**
     * Computes the output of the Gaussian kernel for each element in the target region
     *
     * k = exp(-1 / sigma^2 * max(0, (xx + yy - 2 * xy) / numel(x)));
     *
     * @param xx ||x||^2
     * @param yy ||y||^2
     */
    protected static void gaussianKernel( double xx, double yy, GrayF64 xy, double sigma, GrayF64 output ) {
        double sigma2 = sigma*sigma;
        double N = xy.width*xy.height;

        for (int y = 0; y < xy.height; y++) {
            int index = xy.startIndex + y*xy.stride;

            for (int x = 0; x < xy.width; x++, index++) {
                // (xx + yy - 2 * xy) / numel(x)
                double value = (xx + yy - 2*xy.data[index])/N;

                double v = Math.exp(-Math.max(0, value)/sigma2);

                output.data[index] = v;
            }
        }
    }

    /**
     * Copies the target into the output image and applies the cosine window to it.
     */
    protected void get_subwindow( T image, GrayF64 output ) {
        // copy the target region
        interp.setImage(image);
        int index = 0;
        for (int y = 0; y < workRegionSize; y++) {
            float yy = regionTrack.y0 + y*stepY;

            for (int x = 0; x < workRegionSize; x++) {
                float xx = regionTrack.x0 + x*stepX;

                if (interp.isInFastBounds(xx, yy))
                    output.data[index++] = interp.get_fast(xx, yy);
                else if (BoofMiscOps.isInside(image, xx, yy))
                    output.data[index++] = interp.get(xx, yy);
                else {
                    // randomize to make pixels outside the image poorly correlate. It will then focus on matching
                    // what's inside the image since it has structure
                    output.data[index++] = rand.nextFloat()*maxPixelValue;
                }
            }
        }

        // normalize values to be from -0.5 to 0.5
        PixelMath.divide(output, maxPixelValue, output);
        PixelMath.plus(output, -0.5f, output);

        // apply the cosine window to it
        PixelMath.multiply(output, cosine, output);
    }

    /**
     * The location of the target in the image
     */
    public RectangleLength2D_F32 getTargetLocation() {
        return regionOut;
    }

    /**
     * Visual appearance of the target
     */
    public GrayF64 getTargetTemplate() {
        return template;
    }

    public GrayF64 getResponse() {
        return response;
    }
}