de.citec.tcs.alignment.AbstractStrictAlignmentAlgorithm Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of algorithms-lib Show documentation
This module contains standard implementations of AlignmentAlgorithms. In contrast to the adp module, these implementations are hand-tailored for some specific algorithms and thus achieve a somewhat faster runtime (a constant factor of maybe 30-50 percent).
The newest version!
/* 
 * TCS Alignment Toolbox Version 3
 * 
 * Copyright (C) 2016
 * Benjamin Paaßen
 * AG Theoretical Computer Science
 * Centre of Excellence Cognitive Interaction Technology (CITEC)
 * University of Bielefeld
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.citec.tcs.alignment;

import de.citec.tcs.alignment.comparators.Comparator;
import de.citec.tcs.alignment.comparators.OperationType;
import java.util.List;
import lombok.NonNull;

/**
 * This is an abstract super class implementing the Needleman-Wunsch-Algorithm or
 * Wagner-Fischer-Algorithm to compute a standard edit distance, also called Levenshtein-distance,
 * on input sequences.
 *
 * See also: http://en.wikipedia.org/wiki/Edit_distance
 *
 * Given two sequences (x_1, ... , x_M) ∈ X^* and (y_1, ... , y_N) ∈ Y^* as well as a
 * Comparator d, this distance D is defined via four (recursive) equations:
 *
 * <pre>
 * D(epsilon, epsilon) := 0
 * D((x_1, ..., x_i), epsilon) := D((x_1, ..., x_i-1), epsilon) + d(x_i, null)
 * D(epsilon, (y_1, ..., y_j)) := D(epsilon, (y_1, ..., y_j-1)) + d(null, y_j)
 * D((x_1, ..., x_i), (y_1, ..., y_j)) := min{
 *     D((x_1, ..., x_i-1), (y_1, ..., y_j)) + d(x_i, null),
 *     D((x_1, ..., x_i), (y_1, ..., y_j-1)) + d(null, y_j),
 *     D((x_1, ..., x_i-1), (y_1, ..., y_j-1)) + d(x_i, y_j)
 * }
 * </pre>
 *
 * Note that in the worst case d(x_i, y_j) may return 1 for any input (as per definition of a
 * Comparator). In that case the minimum edit distance is given by replacing all elements in the
 * left input sequence with all elements in the right input sequence and replacing the rest with
 * gaps. Thus, the worst-case alignment distance is max{M, N}. In order to obtain an alignment
 * distance between 0 and 1 we can just return D as specified above, divided by max{M, N}. Note that
 * this implies that the triangle inequality does not hold anymore.
 *
 * @author Benjamin Paassen
 * @param <X> the class of the elements in the left input sequence.
 * @param <Y> the class of the elements in the right input sequence.
 * @param <R> The result class.
 */
public abstract class AbstractStrictAlignmentAlgorithm<X, Y, R> implements AlignmentAlgorithm<X, Y, R> {

	/** The comparator supplying deletion, insertion and replacement costs. */
	private Comparator<X, Y> comparator;
	/** The class of the results produced by this algorithm. */
	private final Class<R> resultClass;
	/** The dynamic programming matrix of the last successful calculateAlignment call. */
	private double[][] lastAlignmentMatrix;

	/**
	 * Creates a new algorithm instance. The given comparator is checked via
	 * ComparatorValidator.validate before it is stored.
	 *
	 * @param comparator the comparator used to compute operation costs.
	 * @param resultClass the class of the results this algorithm produces.
	 */
	public AbstractStrictAlignmentAlgorithm(@NonNull Comparator<X, Y> comparator,
			@NonNull Class<R> resultClass) {
		ComparatorValidator.validate(this, comparator);
		this.comparator = comparator;
		this.resultClass = resultClass;
	}

	/**
	 * Returns the class of the results this algorithm produces.
	 *
	 * @return the result class handed to the constructor.
	 */
	@Override
	public Class<R> getResultClass() {
		return resultClass;
	}

	/**
	 * Returns the comparator currently used to compute operation costs.
	 *
	 * @return the current comparator.
	 */
	@Override
	public Comparator<X, Y> getComparator() {
		return comparator;
	}

	/**
	 * Replaces the comparator used to compute operation costs. The new comparator is checked via
	 * ComparatorValidator.validate before it is stored.
	 *
	 * @param comparator the new comparator.
	 */
	@Override
	public void setComparator(@NonNull Comparator<X, Y> comparator) {
		ComparatorValidator.validate(this, comparator);
		this.comparator = comparator;
	}

	/**
	 * The last matrix that was calculated using this algorithm. The matrix itself is returned, not
	 * a copy. It is only stored after transformToResult has returned, so this is null until the
	 * first call of calculateAlignment has completed.
	 *
	 * @return last matrix that was calculated using this algorithm.
	 */
	public double[][] getLastAlignmentMatrix() {
		return lastAlignmentMatrix;
	}

	/**
	 * Reports which operation types this algorithm uses: deletions, insertions and replacements.
	 *
	 * @param type an operation type.
	 *
	 * @return true if the given type is DELETION, INSERTION or REPLACEMENT, false otherwise.
	 */
	@Override
	public boolean requires(@NonNull OperationType type) {
		switch (type) {
			case DELETION:
			case INSERTION:
			case REPLACEMENT:
				return true;
			default:
				return false;
		}
	}

	/**
	 * Computes the dynamic programming matrix for both input sequences and hands it to
	 * transformToResult to construct the actual result. Entry [i][j] of the matrix contains the
	 * cost of the cheapest alignment of the first i elements of a with the first j elements of b.
	 *
	 * @param a the left-hand input sequence.
	 * @param b the right-hand input sequence.
	 *
	 * @return the result constructed by transformToResult.
	 */
	@Override
	public R calculateAlignment(@NonNull final List<X> a, @NonNull final List<Y> b) {

		final int m = a.size();
		final int n = b.size();

		// store the operation costs to re-use that information
		final double[] delCosts = new double[m];
		for (int i = 0; i < m; i++) {
			delCosts[i] = comparator.compare(OperationType.DELETION, a.get(i), null);
		}
		final double[] insCosts = new double[n];
		for (int j = 0; j < n; j++) {
			insCosts[j] = comparator.compare(OperationType.INSERTION, null, b.get(j));
		}
		final double[][] repCosts = new double[m][n];
		for (int i = 0; i < m; i++) {
			final X x = a.get(i);
			for (int j = 0; j < n; j++) {
				repCosts[i][j] = comparator.compare(OperationType.REPLACEMENT, x, b.get(j));
			}
		}

		//initialize the alignment matrix; entry [0][0] stays 0 (aligning two empty prefixes).
		final double[][] alignMat = new double[m + 1][n + 1];

		//initialize first column, which means the deletion of the entire sequence a.
		for (int i = 1; i <= m; i++) {
			alignMat[i][0] = alignMat[i - 1][0] + delCosts[i - 1];
		}
		//initialize the first row, which means the insertion of the entire sequence b.
		for (int j = 1; j <= n; j++) {
			alignMat[0][j] = alignMat[0][j - 1] + insCosts[j - 1];
		}

		//now start the alignment.
		for (int i = 1; i <= m; i++) {
			for (int j = 1; j <= n; j++) {

				//the new entry is the cheapest of replacement, deletion and insertion.
				alignMat[i][j] = Math.min(
						alignMat[i - 1][j - 1] + repCosts[i - 1][j - 1],
						Math.min(
								alignMat[i - 1][j] + delCosts[i - 1],
								alignMat[i][j - 1] + insCosts[j - 1]
						)
				);
			}
		}
		final R result = transformToResult(alignMat, repCosts, delCosts, insCosts, a, b);
		//the matrix is only published after the result was constructed successfully.
		lastAlignmentMatrix = alignMat;
		return result;
	}

	/**
	 * This method has to be implemented by sub classes to transform a calculated dynamic
	 * programming matrix to a valid result of that implementation. This also has to implement the
	 * backtracing if necessary.
	 *
	 * @param alignmentMatrix a dynamic programming matrix calculated with respect to both input
	 * sequences.
	 * @param repCosts the matrix of pairwise REPLACEMENT costs for each pairwise combination of
	 * elements in the input sequences.
	 * @param delCosts the vector of DELETION costs for each element of the left input sequence.
	 * @param insCosts the vector of INSERTION costs for each element of the right input sequence.
	 * @param a the first input sequence.
	 * @param b the second input sequence.
	 *
	 * @return a valid result for this algorithm implementation.
	 */
	public abstract R transformToResult(@NonNull double[][] alignmentMatrix,
			@NonNull double[][] repCosts, @NonNull double[] delCosts, @NonNull double[] insCosts,
			@NonNull final List<X> a, @NonNull final List<Y> b);

	/**
	 * Normalizes the given raw distance by the worst case that could occur in an alignment of the
	 * two sequences: In the worst case, we replace all elements in a with elements in b and
	 * delete/insert the remaining elements in the longer sequence. The cost of those operations can
	 * be 1 at worst, if the Comparator is properly normalized. Thus, assuming sequence lengths m
	 * and n respectively, we divide the raw distance by max{m, n} for normalization.
	 *
	 * @param <X> the class of elements in the first input sequence.
	 * @param <Y> the class of elements in the second input sequence.
	 * @param d the raw alignment distance between sequences a and b in the range [0,infinity)
	 * @param a the left-hand input sequence.
	 * @param b the right-hand input sequence.
	 *
	 * @return the normalized alignment distance between sequences a and b in the range [0,1].
	 *
	 * @throws IllegalArgumentException if both sequences are empty but d is not 0.
	 */
	public static <X, Y> double normalizeDissimilarity(double d, @NonNull final List<X> a, @NonNull final List<Y> b) {
		final int normalization = Math.max(a.size(), b.size());
		if (normalization == 0) {
			// two empty sequences: avoid dividing by zero; the only consistent distance is 0.
			if (d != 0) {
				throw new IllegalArgumentException("Unexpected internal state: "
						+ "Two aligned empty sequnces lead to a dissimilarity "
						+ "other than 0.");
			}
			return 0;
		} else {
			return d / normalization;
		}
	}
}