de.citec.tcs.alignment.AbstractStrictDTWAlgorithm Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of algorithms-lib Show documentation
This module contains standard implementations of AlignmentAlgorithms. In contrast to the adp module, these implementations are hand-tailored for some specific algorithms and thus achieve a somewhat faster runtime (a constant factor of maybe 30-50 percent).
The newest version!
/* 
 * TCS Alignment Toolbox Version 3
 * 
 * Copyright (C) 2016
 * Benjamin Paaßen
 * AG Theoretical Computer Science
 * Centre of Excellence Cognitive Interaction Technology (CITEC)
 * University of Bielefeld
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.citec.tcs.alignment;

import de.citec.tcs.alignment.comparators.Comparator;
import de.citec.tcs.alignment.comparators.OperationType;
import java.util.List;
import lombok.NonNull;

/**
 * This is an abstract super class implementing the dynamic time warping (DTW) dissimilarity.
 *
 * See also: https://en.wikipedia.org/wiki/Dynamic_time_warping
 *
 * Given two sequences (x_1, ... , x_M) ∈ X* and (y_1, ... , y_N) ∈ Y* as well as a
 * Comparator d, this distance D is defined via four (recursive) equations:
 *
 * <pre>
 * (1) D((x_1), (y_1)) := d(x_1, y_1)
 * (2) D((x_1, ..., x_i), (y_1)) := D((x_1, ..., x_i-1), (y_1)) + d(x_i, y_1)
 * (3) D((x_1), (y_1, ..., y_j)) := D((x_1), (y_1, ..., y_j-1)) + d(x_1, y_j)
 * (4) D((x_1, ..., x_i), (y_1, ..., y_j)) := min{
 *         D((x_1, ..., x_i-1), (y_1, ..., y_j))   + d(x_i, y_j),
 *         D((x_1, ..., x_i),   (y_1, ..., y_j-1)) + d(x_i, y_j),
 *         D((x_1, ..., x_i-1), (y_1, ..., y_j-1)) + d(x_i, y_j)
 *     }
 * </pre>
 *
 * Equations (2) and (3) as well as the first two minimum contributions in equation (4) correspond
 * to prolonging one of the input sequences. This construction also implies that the
 * triangular inequality does not hold for the DTW distance.
 *
 * Note that d(x_i, y_j) contributes to any element in the minimum set in equation (4) such that, in
 * principle, we can compute it outside of the minimum. However, this only holds if the
 * comparator d is consistent across the three operations REPLACEMENT, DELETIONREPLACEMENT and
 * INSERTIONREPLACEMENT. This implementation therefore keeps three separate cost matrices and
 * only shares them if the comparator reports a coherent replacement cost.
 *
 * Further note that in the worst case d(x_i, y_j) may return 1 for any input (as per definition of
 * a Comparator). In that case the minimum DTW distance is given by replacing all elements in the
 * left input sequence with all elements in the right input sequence and replacing the rest with
 * gaps. Thus, the worst-case alignment distance is max{M, N}. In order to obtain an alignment
 * distance between 0 and 1 we can just return D as specified above, divided by max{M, N}.
 *
 * @author Benjamin Paassen - [email protected]
 * @param <X> the class of the elements in the left input sequence.
 * @param <Y> the class of the elements in the right input sequence.
 * @param <R> The result class.
 */
public abstract class AbstractStrictDTWAlgorithm<X, Y, R> implements AlignmentAlgorithm<X, Y, R> {

	private final Class<R> resultClass;
	private Comparator<X, Y> comparator;
	// Dynamic programming matrix of the most recent calculateAlignment call; null before the
	// first call.
	private double[][] lastDTWMatrix;

	/**
	 * Creates a new algorithm instance. The given comparator is passed through
	 * ComparatorValidator.validate before it is stored.
	 *
	 * @param resultClass the class of the results this algorithm produces.
	 * @param comparator the comparator used to obtain the local replacement costs.
	 */
	public AbstractStrictDTWAlgorithm(@NonNull Class<R> resultClass, @NonNull Comparator<X, Y> comparator) {
		this.resultClass = resultClass;
		ComparatorValidator.validate(this, comparator);
		this.comparator = comparator;
	}

	/**
	 * Returns the result class that was handed to the constructor.
	 *
	 * @return the result class that was handed to the constructor.
	 */
	@Override
	public Class<R> getResultClass() {
		return resultClass;
	}

	/**
	 * Returns the comparator that is currently used to calculate local costs.
	 *
	 * @return the comparator that is currently used to calculate local costs.
	 */
	@Override
	public Comparator<X, Y> getComparator() {
		return comparator;
	}

	/**
	 * Replaces the comparator of this algorithm. The given comparator is passed through
	 * ComparatorValidator.validate before it is stored.
	 *
	 * @param comparator the new comparator.
	 */
	@Override
	public void setComparator(@NonNull Comparator<X, Y> comparator) {
		ComparatorValidator.validate(this, comparator);
		this.comparator = comparator;
	}

	/**
	 * Returns true for exactly those operation types this algorithm queries from its comparator,
	 * namely REPLACEMENT, DELETIONREPLACEMENT and INSERTIONREPLACEMENT.
	 *
	 * @param type an operation type.
	 *
	 * @return true if the given type is REPLACEMENT, DELETIONREPLACEMENT or INSERTIONREPLACEMENT
	 * and false otherwise.
	 */
	@Override
	public boolean requires(@NonNull OperationType type) {
		switch (type) {
			case REPLACEMENT:
			case DELETIONREPLACEMENT:
			case INSERTIONREPLACEMENT:
				return true;
			default:
				return false;
		}
	}

	/**
	 * This returns the dynamic programming matrix that was calculated in the last call of
	 * calculateAlignment. Note that the internal array itself is returned, not a copy.
	 *
	 * @return the dynamic programming matrix that was calculated in the last call of
	 * calculateAlignment or null if calculateAlignment has not been called yet.
	 */
	public double[][] getLastDTWMatrix() {
		return lastDTWMatrix;
	}

	/**
	 * Calculates the DTW dynamic programming matrix for the given input sequences, stores it
	 * (see getLastDTWMatrix) and delegates the construction of the actual result to
	 * transformToResult.
	 *
	 * @param a the first (left) input sequence. Must not be empty.
	 * @param b the second (right) input sequence. Must not be empty.
	 *
	 * @return the result constructed by transformToResult.
	 *
	 * @throws IllegalArgumentException if one of the input sequences is empty.
	 */
	@Override
	public R calculateAlignment(@NonNull List<X> a, @NonNull List<Y> b) {

		if (a.isEmpty()) {
			throw new IllegalArgumentException("The first given sequence is empty!");
		}
		if (b.isEmpty()) {
			throw new IllegalArgumentException("The second given sequence is empty!");
		}

		final int m = a.size();
		final int n = b.size();

		// Query the comparator and its coherence flag exactly once, such that the decision
		// whether the cost matrices are shared and the filling of these matrices below can
		// not disagree.
		final Comparator<X, Y> comp = getComparator();
		final boolean coherent = comp.hasCoherentReplacementCost();

		/*
		 * Pre-cache the replacement costs. If the replacement cost is coherent, all three
		 * operations share the very same cost matrix.
		 */
		final double[][] repCosts = new double[m][n];
		final double[][] delRepCosts = coherent ? repCosts : new double[m][n];
		final double[][] insRepCosts = coherent ? repCosts : new double[m][n];
		for (int i = 0; i < m; i++) {
			final X x = a.get(i);
			for (int j = 0; j < n; j++) {
				final Y y = b.get(j);

				repCosts[i][j] = comp.compare(OperationType.REPLACEMENT, x, y);
				if (!coherent) {
					delRepCosts[i][j] = comp.compare(OperationType.DELETIONREPLACEMENT, x, y);
					insRepCosts[i][j] = comp.compare(OperationType.INSERTIONREPLACEMENT, x, y);
				}
			}
		}

		final double[][] dtwMatrix = new double[m][n];
		/*
		 * initialize the alignment matrix. Note that I do not use the classic
		 * trick to initialize the first row and column with infinity. First
		 * this blows up the matrix by a linear summand, second it is an
		 * unnecessary workaround from my perspective, because you can exploit
		 * some nice properties of the second row and column (which I treat as
		 * the first one) to initialize it directly.
		 */

		//initialize the first entry, which is just the comparison of the first
		//entries of both sequences. Note that the sequences are not allowed to be empty.
		dtwMatrix[0][0] = repCosts[0][0];

		//initialize first column. Here we can only elongate the first sequence
		//until the second one ends.
		for (int i = 1; i < m; i++) {
			dtwMatrix[i][0] = dtwMatrix[i - 1][0] + delRepCosts[i][0];
		}
		//initialize the first row. Here we can only elongate the second sequence
		//until the first one ends.
		for (int j = 1; j < n; j++) {
			dtwMatrix[0][j] = dtwMatrix[0][j - 1] + insRepCosts[0][j];
		}

		/*
		 * use the actual DTW algorithm
		 */
		for (int i = 1; i < m; i++) {
			for (int j = 1; j < n; j++) {
				//calculate the new DTW matrix entry as the cheapest of the three
				//possible predecessors plus the respective local cost.
				dtwMatrix[i][j] = Math.min(
						dtwMatrix[i - 1][j - 1] + repCosts[i][j],
						Math.min(
								dtwMatrix[i - 1][j] + delRepCosts[i][j],
								dtwMatrix[i][j - 1] + insRepCosts[i][j]
						)
				);
			}
		}
		lastDTWMatrix = dtwMatrix;
		return transformToResult(dtwMatrix, repCosts, delRepCosts, insRepCosts, a, b);
	}

	/**
	 * This method has to be implemented by sub classes to transform a calculated dynamic
	 * programming matrix to a valid result of that implementation. This also has to implement the
	 * backtracing if necessary.
	 *
	 * Note that repCosts, delRepCosts and insRepCosts are one and the same array if the comparator
	 * reports a coherent replacement cost.
	 *
	 * @param dtwMatrix a dynamic programming matrix calculated with respect to both input
	 * sequences.
	 * @param repCosts the matrix of pairwise REPLACEMENT costs for each pairwise combination of
	 * elements in the input sequences.
	 * @param delRepCosts the matrix of pairwise DELETIONREPLACEMENT costs for each pairwise
	 * combination of elements in the input sequences.
	 * @param insRepCosts the matrix of pairwise INSERTIONREPLACEMENT costs for each pairwise
	 * combination of elements in the input sequences.
	 * @param a the first input sequence.
	 * @param b the second input sequence.
	 *
	 * @return a valid result for this algorithm implementation.
	 */
	public abstract R transformToResult(@NonNull double[][] dtwMatrix,
			@NonNull double[][] repCosts, @NonNull double[][] delRepCosts, @NonNull double[][] insRepCosts,
			@NonNull final List<X> a, @NonNull final List<Y> b);

	/**
	 * Normalizes the given raw distance by the worst case that could occur in an alignment of the
	 * two sequences: In the worst case, we replace all elements in a with elements in b and
	 * delete-replace/insert-replace the remaining elements in the longer sequence. The cost of
	 * those operations can be 1 at worst, if the Comparator is properly normalized. Thus, assuming
	 * sequence lengths m and n respectively, we divide the raw distance by max{m, n} for
	 * normalization.
	 *
	 * @param <X> the class of elements in the first input sequence.
	 * @param <Y> the class of elements in the second input sequence.
	 * @param d the raw alignment distance between sequences a and b in the range [0,infinity)
	 * @param a the left-hand input sequence.
	 * @param b the right-hand input sequence.
	 *
	 * @return the normalized alignment distance between sequences a and b in the range [0,1].
	 *
	 * @throws IllegalArgumentException if both sequences are empty but d is not 0.
	 */
	public static <X, Y> double normalizeDissimilarity(double d, @NonNull final List<X> a, @NonNull final List<Y> b) {
		final int normalization = Math.max(a.size(), b.size());
		if (normalization == 0) {
			// two empty sequences: avoid dividing by zero; only a distance of 0 is plausible.
			if (d != 0) {
				throw new IllegalArgumentException("Unexpected internal state: "
						+ "Two aligned empty sequences lead to a dissimilarity "
						+ "other than 0.");
			}
			return 0;
		} else {
			return d / normalization;
		}
	}
}