All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.citec.tcs.alignment.AbstractStrictAlignmentAlgorithm Maven / Gradle / Ivy

Go to download

This module contains standard implementations of AlignmentAlgorithms. In contrast to the adp module, these implementations are hand-tailored for some specific algorithms and thus achieve a somewhat faster runtime (a constant factor of maybe 30-50 percent).

The newest version!
/* 
 * TCS Alignment Toolbox Version 3
 * 
 * Copyright (C) 2016
 * Benjamin Paaßen
 * AG Theoretical Computer Science
 * Centre of Excellence Cognitive Interaction Technology (CITEC)
 * University of Bielefeld
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.citec.tcs.alignment;

import de.citec.tcs.alignment.comparators.Comparator;
import de.citec.tcs.alignment.comparators.OperationType;
import java.util.List;
import lombok.NonNull;

/**
 * This is an abstract super class implementing the Needleman-Wunsch-Algorithm or
 * Wagner-Fischer-Algorithm to compute a standard edit distance, also called Levenshtein-distance,
 * on input sequences.
 *
 * See also: http://en.wikipedia.org/wiki/Edit_distance
 *
 * Given two sequences (x_1, ... , x_M) ∈ X* and (y_1, ... , y_N) ∈ Y^* as well as a
 * Comparator d, this distance D is defined via four (recursive) equations:
 *
 * 
    *
  1. D(epsilon, epsilon) := 0
  2. *
  3. D((x_1, ..., x_i) , epsilon) := D((x_i, ... , x_i-1) , epsilon) + d(x_i, null)
  4. *
  5. D(epsilon, (y_1, ... , y_j)) := D(epsilon , (y_1, ... , y_j-1)) + d(null, y_j)
  6. *
  7. D((x_1, ..., x_i), (y_1, ... , y_j)) := min{ * D((x_i, ... , x_i-1) , (y_1, ... , y_j)) + d(x_i, null) , * D((x_1, ..., x_i) , (y_1, ... , y_j-1)) + d(null, y_j) , * D((x_i, ... , x_i-1) , (y_1, ... , y_j-1)) + d(x_i, y_j) * }
  8. *
* * Note that in the worst case d(x_i, y_j) may return 1 for any input (as per definition of a * Comparator). In that case the minimum edit distance is given by replacing all elements in the * left input sequence with all elements in the right input sequence and replacing the rest with * gaps. Thus, the worst-case alignment distance is max{M, N}. In order to obtain an alignment * distance between 0 and 1 we can just return D as specified above, divided by max{M, N}. Note that * this implies that the triangular inequality does not hold anymore. * * @author Benjamin Paassen - [email protected] * @param the class of the elements in the left input sequence. * @param the class of the elements in the right input sequence. * @param The result class. */ public abstract class AbstractStrictAlignmentAlgorithm implements AlignmentAlgorithm { private Comparator comparator; private final Class resultClass; private double[][] lastAlignmentMatrix; public AbstractStrictAlignmentAlgorithm(@NonNull Comparator comparator, @NonNull Class resultClass) { ComparatorValidator.validate(this, comparator); this.comparator = comparator; this.resultClass = resultClass; } @Override public Class getResultClass() { return resultClass; } @Override public Comparator getComparator() { return comparator; } @Override public void setComparator(@NonNull Comparator comparator) { ComparatorValidator.validate(this, comparator); this.comparator = comparator; } /** * The last matrix that was calculated using this algorithm. * * @return last matrix that was calculated using this algorithm. 
*/ public double[][] getLastAlignmentMatrix() { return lastAlignmentMatrix; } @Override public boolean requires(@NonNull OperationType type) { switch (type) { case DELETION: case INSERTION: case REPLACEMENT: return true; default: return false; } } @Override public R calculateAlignment(@NonNull final List a, @NonNull final List b) { final int m = a.size(); final int n = b.size(); // store the operation costs to re-use that information final double[] delCosts = new double[m]; for (int i = 0; i < m; i++) { delCosts[i] = comparator.compare(OperationType.DELETION, a.get(i), null); } final double[] insCosts = new double[n]; for (int j = 0; j < n; j++) { insCosts[j] = comparator.compare(OperationType.INSERTION, null, b.get(j)); } final double[][] repCosts = new double[m][n]; for (int i = 0; i < m; i++) { final X x = a.get(i); for (int j = 0; j < n; j++) { repCosts[i][j] = comparator.compare(OperationType.REPLACEMENT, x, b.get(j)); } } //initialize the alignment matrix. final double[][] alignMat = new double[m + 1][n + 1]; //initialize first column, which means the deletion of the entire sequence a. for (int i = 1; i <= m; i++) { alignMat[i][0] = alignMat[i - 1][0] + delCosts[i - 1]; } //initialize the first row, which means the insertion of the entire sequence b. for (int j = 1; j <= n; j++) { alignMat[0][j] = alignMat[0][j - 1] + insCosts[j - 1]; } //now start the alignment. for (int i = 1; i <= m; i++) { for (int j = 1; j <= n; j++) { //calculate the new alignment matrix entry. alignMat[i][j] = Math.min( alignMat[i - 1][j - 1] + repCosts[i - 1][j - 1], Math.min( alignMat[i - 1][j] + delCosts[i - 1], alignMat[i][j - 1] + insCosts[j - 1] ) ); } } final R result = transformToResult(alignMat, repCosts, delCosts, insCosts, a, b); lastAlignmentMatrix = alignMat; return result; } /** * This method has to be implemented by sub classes to transform a calculated dynamic * programming matrix to a valid result of that implementation. 
This also has to implement the * backtracing if necessary. * * @param alignmentMatrix a dynamic programming matrix calculated with respect to both input * sequences. * @param repCosts the matrix of pairwise REPLACEMENT costs for each pairwise combination of * elements in the input sequences. * @param delCosts the vector of DELETION costs for each element of the left input sequence. * @param insCosts the vector of INSERTION costs for each element of the right input sequence. * @param a the first input sequence. * @param b the second input sequence. * * @return a valid result for this algorithm implementation. */ public abstract R transformToResult(@NonNull double[][] alignmentMatrix, @NonNull double[][] repCosts, @NonNull double[] delCosts, @NonNull double[] insCosts, @NonNull final List a, @NonNull final List b); /** * Normalizes the given raw distance by the worst case that could occur in an alignment of the * two sequences: In the worst case, we replace all elements in a with elements in b and * delete/insert the remaining elements in the longer sequence. The cost of those operations can * be 1 at worst, if the Comparator is properly normalized. Thus, assuming sequence lengths m * and n respectively, we divide the raw distance by max{m, n} for normalization. * * @param the class of elements in the first input sequence. * @param the class of elements in the second input sequence. * @param d the raw alignment distance between sequences a and b in the range [0,infinity) * @param a the left-hand input sequence. * @param b the right-hand input sequence. * * @return the normalized alignment distance between sequences a and b in the range [0,1]. 
*/ public static double normalizeDissimilarity(double d, @NonNull final List a, @NonNull final List b) { final int normalization = Math.max(a.size(), b.size()); if (normalization == 0) { if (d != 0) { throw new IllegalArgumentException("Unexpected internal state: " + "Two aligned empty sequnces lead to a dissimilarity " + "other than 0."); } return 0; } else { return d / normalization; } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy