
de.citec.tcs.alignment.AbstractStrictAlignmentAlgorithm Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of algorithms-lib Show documentation
Show all versions of algorithms-lib Show documentation
This module contains standard implementations of
AlignmentAlgorithms. In contrast to the adp module these implementations
are hand-tailored for some specific algorithms and thus achieve somewhat
faster runtime (a constant factor of maybe 30-50 percent).
The newest version!
/*
* TCS Alignment Toolbox Version 3
*
* Copyright (C) 2016
* Benjamin Paaßen
* AG Theoretical Computer Science
* Centre of Excellence Cognitive Interaction Technology (CITEC)
* University of Bielefeld
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.citec.tcs.alignment;
import de.citec.tcs.alignment.comparators.Comparator;
import de.citec.tcs.alignment.comparators.OperationType;
import java.util.List;
import lombok.NonNull;
/**
 * This is an abstract super class implementing the Needleman-Wunsch-Algorithm or
 * Wagner-Fischer-Algorithm to compute a standard edit distance, also called Levenshtein-distance,
 * on input sequences.
 *
 * <p>See also: http://en.wikipedia.org/wiki/Edit_distance
 *
 * <p>Given two sequences (x_1, ..., x_M) ∈ X* and (y_1, ..., y_N) ∈ Y* as well as a
 * Comparator d, this distance D is defined via four (recursive) equations:
 *
 * <ul>
 * <li>D(epsilon, epsilon) := 0</li>
 * <li>D((x_1, ..., x_i), epsilon) := D((x_1, ..., x_i-1), epsilon) + d(x_i, null)</li>
 * <li>D(epsilon, (y_1, ..., y_j)) := D(epsilon, (y_1, ..., y_j-1)) + d(null, y_j)</li>
 * <li>D((x_1, ..., x_i), (y_1, ..., y_j)) := min{
 * D((x_1, ..., x_i-1), (y_1, ..., y_j)) + d(x_i, null),
 * D((x_1, ..., x_i), (y_1, ..., y_j-1)) + d(null, y_j),
 * D((x_1, ..., x_i-1), (y_1, ..., y_j-1)) + d(x_i, y_j)
 * }</li>
 * </ul>
 *
 * <p>Note that in the worst case d(x_i, y_j) may return 1 for any input (as per definition of a
 * Comparator). In that case the minimum edit distance is given by replacing all elements in the
 * left input sequence with all elements in the right input sequence and replacing the rest with
 * gaps. Thus, the worst-case alignment distance is max{M, N}. In order to obtain an alignment
 * distance between 0 and 1 we can just return D as specified above, divided by max{M, N}. Note that
 * this implies that the triangular inequality does not hold anymore.
 *
 * <p>Sub classes only have to implement
 * {@link #transformToResult(double[][], double[][], double[], double[], List, List)}, which turns
 * the filled dynamic programming matrix into the result object.
 *
 * @author Benjamin Paassen - [email protected]
 * @param <X> the class of the elements in the left input sequence.
 * @param <Y> the class of the elements in the right input sequence.
 * @param <R> The result class.
 */
public abstract class AbstractStrictAlignmentAlgorithm<X, Y, R>
        implements AlignmentAlgorithm<X, Y, R> {

    /** The comparator supplying deletion, insertion and replacement costs. */
    private Comparator<X, Y> comparator;
    /** The class of the result objects produced by this algorithm. */
    private final Class<R> resultClass;
    /**
     * The dynamic programming matrix of the most recent successful call to
     * {@link #calculateAlignment(List, List)}; {@code null} until the first such call.
     */
    private double[][] lastAlignmentMatrix;

    /**
     * Creates a new algorithm instance. The comparator is passed through
     * {@code ComparatorValidator.validate} before it is stored.
     *
     * @param comparator the comparator used to compute the local operation costs; must not be
     * null.
     * @param resultClass the class of the results of this algorithm; must not be null.
     */
    public AbstractStrictAlignmentAlgorithm(@NonNull Comparator<X, Y> comparator,
            @NonNull Class<R> resultClass) {
        ComparatorValidator.validate(this, comparator);
        this.comparator = comparator;
        this.resultClass = resultClass;
    }

    /**
     * Returns the class of the results of this algorithm.
     *
     * @return the result class handed to the constructor.
     */
    @Override
    public Class<R> getResultClass() {
        return resultClass;
    }

    /**
     * Returns the comparator that is currently used to compute local operation costs.
     *
     * @return the current comparator.
     */
    @Override
    public Comparator<X, Y> getComparator() {
        return comparator;
    }

    /**
     * Replaces the comparator. The new comparator is passed through
     * {@code ComparatorValidator.validate} before it is stored, just as in the constructor.
     *
     * @param comparator the new comparator; must not be null.
     */
    @Override
    public void setComparator(@NonNull Comparator<X, Y> comparator) {
        ComparatorValidator.validate(this, comparator);
        this.comparator = comparator;
    }

    /**
     * The last matrix that was calculated using this algorithm. The internal array itself is
     * returned, not a copy.
     *
     * @return last matrix that was calculated using this algorithm, or {@code null} if no
     * alignment has been calculated yet.
     */
    public double[][] getLastAlignmentMatrix() {
        return lastAlignmentMatrix;
    }

    /**
     * States which operation types this algorithm uses.
     *
     * @param type an operation type; must not be null.
     *
     * @return true for DELETION, INSERTION and REPLACEMENT, false for every other type.
     */
    @Override
    public boolean requires(@NonNull OperationType type) {
        switch (type) {
            case DELETION:
            case INSERTION:
            case REPLACEMENT:
                return true;
            default:
                return false;
        }
    }

    /**
     * Fills the dynamic programming matrix for the two input sequences and delegates the
     * construction of the result to
     * {@link #transformToResult(double[][], double[][], double[], double[], List, List)}.
     *
     * <p>With m = a.size() and n = b.size() the comparator is called exactly m + n + m * n
     * times (once per deletion, insertion and replacement), and the matrix has the dimensions
     * (m + 1) x (n + 1). Entry [i][j] holds the cost of aligning the first i elements of a with
     * the first j elements of b.
     *
     * @param a the left input sequence; must not be null.
     * @param b the right input sequence; must not be null.
     *
     * @return the result produced by transformToResult for the filled matrix.
     */
    @Override
    public R calculateAlignment(@NonNull final List<X> a, @NonNull final List<Y> b) {
        final int m = a.size();
        final int n = b.size();
        // store the operation costs to re-use that information
        final double[] delCosts = new double[m];
        for (int i = 0; i < m; i++) {
            delCosts[i] = comparator.compare(OperationType.DELETION, a.get(i), null);
        }
        final double[] insCosts = new double[n];
        for (int j = 0; j < n; j++) {
            insCosts[j] = comparator.compare(OperationType.INSERTION, null, b.get(j));
        }
        final double[][] repCosts = new double[m][n];
        for (int i = 0; i < m; i++) {
            final X x = a.get(i);
            for (int j = 0; j < n; j++) {
                repCosts[i][j] = comparator.compare(OperationType.REPLACEMENT, x, b.get(j));
            }
        }
        // initialize the alignment matrix; entry [0][0] stays 0 (two empty prefixes).
        final double[][] alignMat = new double[m + 1][n + 1];
        // initialize first column, which means the deletion of the entire sequence a.
        for (int i = 1; i <= m; i++) {
            alignMat[i][0] = alignMat[i - 1][0] + delCosts[i - 1];
        }
        // initialize the first row, which means the insertion of the entire sequence b.
        for (int j = 1; j <= n; j++) {
            alignMat[0][j] = alignMat[0][j - 1] + insCosts[j - 1];
        }
        // now start the alignment.
        for (int i = 1; i <= m; i++) {
            for (int j = 1; j <= n; j++) {
                // the new entry is the cheapest of replacement (diagonal), deletion (upper)
                // and insertion (left). Matrix indices are shifted by one against the cost
                // arrays because row/column 0 represent the empty prefix.
                alignMat[i][j] = Math.min(
                        alignMat[i - 1][j - 1] + repCosts[i - 1][j - 1],
                        Math.min(
                                alignMat[i - 1][j] + delCosts[i - 1],
                                alignMat[i][j - 1] + insCosts[j - 1]
                        )
                );
            }
        }
        final R result = transformToResult(alignMat, repCosts, delCosts, insCosts, a, b);
        // only remember the matrix once the result has been constructed successfully.
        lastAlignmentMatrix = alignMat;
        return result;
    }

    /**
     * This method has to be implemented by sub classes to transform a calculated dynamic
     * programming matrix to a valid result of that implementation. This also has to implement the
     * backtracing if necessary.
     *
     * @param alignmentMatrix a dynamic programming matrix calculated with respect to both input
     * sequences.
     * @param repCosts the matrix of pairwise REPLACEMENT costs for each pairwise combination of
     * elements in the input sequences.
     * @param delCosts the vector of DELETION costs for each element of the left input sequence.
     * @param insCosts the vector of INSERTION costs for each element of the right input sequence.
     * @param a the first input sequence.
     * @param b the second input sequence.
     *
     * @return a valid result for this algorithm implementation.
     */
    public abstract R transformToResult(@NonNull double[][] alignmentMatrix,
            @NonNull double[][] repCosts, @NonNull double[] delCosts, @NonNull double[] insCosts,
            @NonNull final List<X> a, @NonNull final List<Y> b);

    /**
     * Normalizes the given raw distance by the worst case that could occur in an alignment of the
     * two sequences: In the worst case, we replace all elements in a with elements in b and
     * delete/insert the remaining elements in the longer sequence. The cost of those operations can
     * be 1 at worst, if the Comparator is properly normalized. Thus, assuming sequence lengths m
     * and n respectively, we divide the raw distance by max{m, n} for normalization.
     *
     * @param <X> the class of elements in the first input sequence.
     * @param <Y> the class of elements in the second input sequence.
     * @param d the raw alignment distance between sequences a and b in the range [0,infinity)
     * @param a the left-hand input sequence.
     * @param b the right-hand input sequence.
     *
     * @return the normalized alignment distance between sequences a and b in the range [0,1],
     * provided the Comparator is properly normalized; 0 if both sequences are empty.
     *
     * @throws IllegalArgumentException if both sequences are empty but d is not 0.
     */
    public static <X, Y> double normalizeDissimilarity(double d, @NonNull final List<X> a,
            @NonNull final List<Y> b) {
        final int normalization = Math.max(a.size(), b.size());
        if (normalization == 0) {
            // two empty sequences can only have distance 0; avoid a division by zero.
            if (d != 0) {
                throw new IllegalArgumentException("Unexpected internal state: "
                        + "Two aligned empty sequnces lead to a dissimilarity "
                        + "other than 0.");
            }
            return 0;
        } else {
            return d / normalization;
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy