
de.citec.tcs.alignment.AbstractAffineAlignmentAlgorithm Maven / Gradle / Ivy
/*
* TCS Alignment Toolbox
*
* Copyright (C) 2013-2015
* Benjamin Paaßen, Georg Zentgraf
* AG Theoretical Computer Science
* Centre of Excellence Cognitive Interaction Technology (CITEC)
* University of Bielefeld
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see .
*/
package de.citec.tcs.alignment;
import de.citec.tcs.alignment.comparators.SkipComparator;
import de.citec.tcs.alignment.sequence.Node;
import de.citec.tcs.alignment.sequence.Sequence;
import java.util.ArrayList;
import java.util.EnumMap;
/**
* This is an implementation of the affine alignment algorithm scheme
* combining actually two approaches: First the algorithm of Gotoh et al.
* for gap extensions (meaning, that we allow to skip longer regions in the
* middle of an alignment at low cost) and second the traditional local
* alignment in the sense that we allow skips at the beginning and end of an
* alignment. The task here is to identify those regions of the two input
* sequence that match the best and skip the rest of the sequences at low cost.
*
* A special thanks goes to Prof. Robert Giegerich, Dr. Stefan Janssen and
* Benedikt Löwes at Bielefeld University for teaching me Advanced Dynamic
* Programming. Otherwise this implementation here would not have been possible.
*
* The abstract algorithm that inspires this implementation is described in
* Dr. Georg Sauthoffs "Bellman's Gap" GAP-Language in the file
* bellmans_gap/affine_alignment.gap.
*
* @author Benjamin Paassen - [email protected]
* @param the result class of this alignment algorithm.
*/
public abstract class AbstractAffineAlignmentAlgorithm implements SkipAlignmentAlgorithm {
private final Class resultClass;
private final AlignmentSpecification alignmentSpecification;
private double weightThreshold = 0;
private int minMiddleSkips = 4;
public AbstractAffineAlignmentAlgorithm(Class resultClass,
AlignmentSpecification alignmentSpecification) {
this.resultClass = resultClass;
this.alignmentSpecification = alignmentSpecification;
}
/**
* {@inheritDoc }
*/
@Override
public AlignmentSpecification getSpecification() {
return alignmentSpecification;
}
/**
* {@inheritDoc }
*/
@Override
public Class getResultClass() {
return resultClass;
}
/**
* Set a weight threshold (between 0 and 1) that determines which keywords
* should be ignored during calculation because their weight is negligible.
*
* The default value is 0.
*
* @param weightThreshold a weight threshold (between 0 and 1)
*/
public void setWeightThreshold(double weightThreshold) {
if (weightThreshold < 0 || weightThreshold > 1) {
throw new RuntimeException("A weight threshold has to be between 0 and 1!");
}
this.weightThreshold = weightThreshold;
}
/**
*
* @return The current weight threshold (0 per default).
*/
public double getWeightThreshold() {
return weightThreshold;
}
/**
* The minimum number of skips that have to be done in the middle
* of an alignment. Otherwise only deletions and insertions are allowed.
* This is 4 per default.
*
* @param minMiddleSkips the minimum number of skips that have to be done in
* the middle of an alignment.
*/
public void setMinMiddleSkips(int minMiddleSkips) {
this.minMiddleSkips = minMiddleSkips;
}
/**
* The minimum number of skips that have to be done in the middle
* of an alignment. Otherwise only deletions and insertions are allowed.
* This is 4 per default.
*
* @return the minimum number of skips that have to be done in the middle
* of an alignment.
*/
public int getMinMiddleSkips() {
return minMiddleSkips;
}
/**
* {@inheritDoc }
*/
@Override
public R calculateAlignment(Sequence a, Sequence b) {
//check validity
if (a.getNodeSpecification() != alignmentSpecification.getNodeSpecification()
&& !a.getNodeSpecification().equals(alignmentSpecification.getNodeSpecification())) {
throw new RuntimeException(
"The first input sequence has an unexpected node specification!");
}
if (a.getNodeSpecification() != b.getNodeSpecification()
&& !a.getNodeSpecification().equals(b.getNodeSpecification())) {
throw new RuntimeException(
"The node specifications of both input sequences to not match!");
}
//check validity of comparators.
for (int k = 0; k < alignmentSpecification.size(); k++) {
if (!(alignmentSpecification.getComparator(k) instanceof SkipComparator)) {
throw new RuntimeException("The comparator for keyword "
+ alignmentSpecification.getKeyword(k) + " does not support skips!");
}
}
//identify the subset of comparators that have an above threshold weighting.
final ArrayList relevantIndices = new ArrayList<>();
for (int k = 0; k < alignmentSpecification.size(); k++) {
if (alignmentSpecification.getWeighting()[k] > weightThreshold) {
relevantIndices.add(k);
}
}
//pre-calculation caching.
final SkipComparator[] comparators = new SkipComparator[relevantIndices.size()];
final double[] weights = new double[relevantIndices.size()];
final int[] originalIndices = new int[relevantIndices.size()];
for (int k = 0; k < comparators.length; k++) {
comparators[k] = (SkipComparator) alignmentSpecification.getComparator(relevantIndices.get(k));
weights[k] = alignmentSpecification.getWeighting()[relevantIndices.get(k)];
originalIndices[k] = alignmentSpecification.getOriginalIndex(relevantIndices.get(k));
}
final int M = a.getNodes().size();
final int N = b.getNodes().size();
/*
* Initialize the dynamic programming tables for each recurrence.
*/
final EnumMap dp_tables = new EnumMap<>(Recurrence.class);
for (final Recurrence rec : Recurrence.values()) {
dp_tables.put(rec, new double[M + 1][N + 1]);
}
/*
* As we have more recurrences than operations it is probably better
* to pre-calculate the local operation costs.
*/
Node left;
Node right;
final double[][] compareMatrix = new double[M][N];
if (N > 0) {
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j++) {
left = a.getNodes().get(i);
right = b.getNodes().get(j);
for (int k = 0; k < comparators.length; k++) {
compareMatrix[i][j] += weights[k] * comparators[k].compare(
left.getValue(originalIndices[k]),
right.getValue(originalIndices[k]));
}
}
}
}
final double[] delMatrix = new double[M];
final double[] insMatrix = new double[N];
final double[] skipDelMatrix = new double[M];
final double[] skipInsMatrix = new double[N];
for (int i = 0; i < M; i++) {
left = a.getNodes().get(i);
for (int k = 0; k < comparators.length; k++) {
delMatrix[i] += weights[k] * comparators[k].delete(
left.getValue(originalIndices[k]));
skipDelMatrix[i] += weights[k] * comparators[k].skipDelete(
left.getValue(originalIndices[k]));
}
}
for (int j = 0; j < N; j++) {
right = b.getNodes().get(j);
for (int k = 0; k < comparators.length; k++) {
insMatrix[j] += weights[k] * comparators[k].insert(
right.getValue(originalIndices[k]));
skipInsMatrix[j] += weights[k] * comparators[k].skipInsert(
right.getValue(originalIndices[k]));
}
}
/*
* I'll quote the corresponding line of the Bellmans Gap grammar
* here to make clear, what I'm doing.
*/
/*
* Calculate the SKIPINS_END and SKIPDEL_END recurrence first.
*
* SKIPINS_END = skip_ins(, SKIPINS_END) |
* nil() #h;
* SKIPDEL_END = skip_del(, SKIPDEL_END) | SKIPINS_END
* #h;
*
* This leads to:
* SKIPINS_END(M,N) = 0
* SKIPINS_END(M,j) = SKIPINS_END(M,j+1) + skip_ins(j) where j < N
* SKIPDEL_END(M,j) = SKIPINS_END(M,j)
* SKIPDEL_END(i,j) = SKIPINS_END(i+1,j) + skip_del(i) where i < M
*/
for (int j = N - 1; j >= 0; j--) {
dp_tables.get(Recurrence.SKIPINS_END)[M][j]
= dp_tables.get(Recurrence.SKIPINS_END)[M][j + 1]
+ skipInsMatrix[j];
dp_tables.get(Recurrence.SKIPDEL_END)[M][j]
= dp_tables.get(Recurrence.SKIPINS_END)[M][j];
}
for (int i = M - 1; i >= 0; i--) {
for (int j = N; j >= 0; j--) {
dp_tables.get(Recurrence.SKIPDEL_END)[i][j]
= dp_tables.get(Recurrence.SKIPDEL_END)[i + 1][j]
+ skipDelMatrix[i];
}
}
/*
* Now we can initialize the last row and column of the ALI
* recurrence, which is defined as:
*
* ALI = del(, DEL) | ins(, INS) |
* rep(, ALI) |
* SKIPDEL_MIDDLE | SKIPINS_MIDDLE | SKIPDEL_END #h;
*
* The DEL, INS, SKIPDEL_MIDDLE and SKIPINS_MIDDLE recurrences
* depend on ALI themselves, thus the only choice at the end is
* SKIPDEL_END.
*
* ALI = SKIPDEL_END;
*
*/
/*
* Calculate this part of the ALI matrix:
*
* -----------
* -----------
* -----------
* -----------
* -----------
* xxxxxxxxxx-
*/
for (int i = M - 1; i >= 0; i--) {
dp_tables.get(Recurrence.ALI)[i][N]
= dp_tables.get(Recurrence.SKIPDEL_END)[i][N];
}
/*
* Calculate this part of the ALI matrix:
*
* ----------x
* ----------x
* ----------x
* ----------x
* ----------x
* ----------x
*/
for (int j = N; j >= 0; j--) {
dp_tables.get(Recurrence.ALI)[M][j]
= dp_tables.get(Recurrence.SKIPDEL_END)[M][j];
}
/*
* We now calculate the part of ALI where we can not yet use the
* middle skips because we have not yet reached the necessary
* minimum number of letters to skip (and we also need one replacement
* at the end). That marks this part of the ALI matrix:
*
* -----------
* ------xxxx-
* ------xxxx-
* ------xxxx-
* ------xxxx-
* -----------
*/
final int leftPreMiddleSkipsLimit = Math.max(-1, M - minMiddleSkips - 1);
final int rightPreMiddleSkipsLimit = Math.max(-1, N - minMiddleSkips - 1);
/*
* At position [M-1][N-1] of ALI, we can either use
* SKIPDEL_END or replacements. We can also initialize that position
* for the DEL and INS recurrence, because it is very similar.
*
* ALI = rep(, ALI) | SKIPDEL_END #h;
* DEL = rep(, ALI);
* INS = rep(, ALI);
*
* -----------
* -----------
* -----------
* -----------
* ---------x-
* -----------
*/
if (M > 0 && N > 0) {
final double repLocal = dp_tables.get(Recurrence.ALI)[M][N]
+ compareMatrix[M - 1][N - 1];
dp_tables.get(Recurrence.ALI)[M - 1][N - 1] = choice(
dp_tables.get(Recurrence.SKIPDEL_END)[M - 1][N - 1],
repLocal);
dp_tables.get(Recurrence.DEL)[M - 1][N - 1] = repLocal;
dp_tables.get(Recurrence.INS)[M - 1][N - 1] = repLocal;
}
/*
* In the second-last row we can use deletions, but no insertions.
* We will also calculate the DEL and INS recurrence here, which are
* defined as:
*
* DEL = del(, DEL) | ins(, INS) |
* rep(, ALI) #h;
* INS = ins(, INS) | rep(, ALI) #h;
*
* Relevant for this part are:
*
* ALI = del(, DEL) | rep(, ALI) | SKIPDEL_END
* #h;
* DEL = del(, DEL) | rep(, ALI) #h;
* INS = rep(, ALI) #h;
*
* Calculate this part of the ALI matrix:
*
* -----------
* -----------
* -----------
* -----------
* -----xxxx--
* -----------
*/
if (N > 0) {
for (int i = M - 2; i >= 0; i--) {
final double repLocal
= dp_tables.get(Recurrence.ALI)[i + 1][N]
+ compareMatrix[i][N - 1];
final double delLocal
= dp_tables.get(Recurrence.DEL)[i + 1][N - 1]
+ delMatrix[i];
//calculate ALI
if (i > leftPreMiddleSkipsLimit) {
dp_tables.get(Recurrence.ALI)[i][N - 1] = choice(
repLocal,
delLocal,
dp_tables.get(Recurrence.SKIPDEL_END)[i][N - 1]);
}
//calculate DEL
dp_tables.get(Recurrence.DEL)[i][N - 1] = choice(
repLocal, delLocal);
//calculate INS
dp_tables.get(Recurrence.INS)[i][N - 1] = repLocal;
}
}
/*
* In the second last column we can use insertions, but no deletions.
*
* Relevant for this part is:
*
* ALI = ins(, INS) | rep(, ALI) |
* SKIPDEL_END #h;
*
* DEL = ins(, INS) | rep(, ALI) #h;
*
* INS = ins(, INS) | rep(, ALI) #h;
*
* Calculate this part of the ALI matrix:
*
* -----------
* ---------x-
* ---------x-
* ---------x-
* -----------
* -----------
*/
if (M > 0) {
for (int j = N - 2; j >= 0; j--) {
final double repLocal
= dp_tables.get(Recurrence.ALI)[M][j + 1]
+ compareMatrix[M - 1][j];
final double insLocal
= dp_tables.get(Recurrence.INS)[M - 1][j + 1]
+ insMatrix[j];
//calculate ALI
if (j > rightPreMiddleSkipsLimit) {
dp_tables.get(Recurrence.ALI)[M - 1][j] = choice(
repLocal,
insLocal,
dp_tables.get(Recurrence.SKIPDEL_END)[M - 1][j]);
}
//calculate DEL
dp_tables.get(Recurrence.DEL)[M - 1][j] = choice(
repLocal, insLocal);
//calculate INS
dp_tables.get(Recurrence.INS)[M - 1][j] = choice(
repLocal, insLocal);
}
}
/*
* In the next part of the matrix, we can calculate insertions as well
* as deletions, but no skips yet. So the relevant recurrences are:
*
* ALI = del(, DEL) | ins(, INS) |
* rep(, ALI) | SKIPDEL_END #h;
*
* DEL = del(, DEL) | ins(, INS) |
* rep(, ALI) #h;
*
* INS = ins(, INS) | rep(, ALI) #h;
*/
/*
* Calculate this part of the ALI matrix:
*
* -----------
* ------xxx--
* ------xxx--
* ------xxx--
* -----------
* -----------
*/
if (N > 1) {
for (int i = M - 2; i > leftPreMiddleSkipsLimit; i--) {
for (int j = N - 2; j > rightPreMiddleSkipsLimit; j--) {
final double[] choices = new double[4];
choices[0] = dp_tables.get(Recurrence.DEL)[i + 1][j]
+ delMatrix[i];
choices[1] = dp_tables.get(Recurrence.INS)[i][j + 1]
+ insMatrix[j];
choices[2] = dp_tables.get(Recurrence.ALI)[i + 1][j + 1]
+ compareMatrix[i][j];
choices[3] = dp_tables.get(Recurrence.SKIPDEL_END)[i][j];
dp_tables.get(Recurrence.ALI)[i][j] = choice(choices);
dp_tables.get(Recurrence.DEL)[i][j] = choice(
choices[0], choices[1], choices[2]);
dp_tables.get(Recurrence.INS)[i][j] = choice(
choices[1], choices[2]);
}
}
}
/*
* We do the same for the part of the ALI matrix where we only can
* use deletions and skip-deletions, but no insertions. So the
* relevant recurrences are:
*
* ALI = del(, DEL) | rep(, ALI) |
* SKIPDEL_MIDDLE | SKIPDEL_END #h;
*
* DEL = del(, DEL) | ins(, INS) |
* rep(, ALI) #h;
*
* SKIPDEL_MIDDLE = skip_del(,
* skip_del(,
* skip_del(,
* skip_del(,
* SKIPDEL_MIDDLE_LOOP)))) #h;
*
* SKIPDEL_MIDDLE_LOOP = skip_del(, SKIPDEL_MIDDLE_LOOP) |
* rep(,ALI) #h;
*
* So we also need to calculate the SKIPDEL_MIDDLE_LOOP recurrence,
* which we call SKIPDEL_MIDDLE here, because we do not need to tabulate
* the SKIPDEL_MIDDLE reccurence. This can be circumvented by
* converting the grammar to:
*
* ALI = del(, DEL) | rep(, ALI) |
* skip_del(,
* skip_del(,
* skip_del(,
* skip_del(,
* SKIPDEL_MIDDLE_LOOP)))) | SKIPDEL_END #h;
*
* So the SKIPDEL_MIDDLE recurrence is just for better readability in
* the gap-L-code and not needed per se.
*
* calculate this part of the ALI matrix:
*
* -----------
* -----------
* -----------
* -----------
* xxxxxx-----
* -----------
*
*/
if (M > minMiddleSkips + 1 && N > 0) {
//initialize second-last column of SKIPDEL_MIDDLE
for (int j = N - 1; j >= 0; j--) {
dp_tables.get(Recurrence.SKIPDEL_MIDDLE)[M - 1][j]
= dp_tables.get(Recurrence.ALI)[M][j + 1]
+ compareMatrix[M - 1][j];
}
//initialize second-last row of SKIPDEL_MIDDLE up to the desired point.
for (int i = M - 2; i > leftPreMiddleSkipsLimit; i--) {
final double repLocal = dp_tables.get(Recurrence.ALI)[i + 1][N]
+ compareMatrix[i][N - 1];
dp_tables.get(Recurrence.SKIPDEL_MIDDLE)[i][N - 1] = choice(
dp_tables.get(Recurrence.SKIPDEL_MIDDLE)[i + 1][N - 1]
+ skipDelMatrix[i], repLocal);
}
for (int i = leftPreMiddleSkipsLimit; i >= 0; i--) {
//calculate SKIPDEL_MIDDLE
final double repLocal = dp_tables.get(Recurrence.ALI)[i + 1][N]
+ compareMatrix[i][N - 1];
if (i >= minMiddleSkips) {
dp_tables.get(Recurrence.SKIPDEL_MIDDLE)[i][N - 1] = choice(
dp_tables.get(Recurrence.SKIPDEL_MIDDLE)[i + 1][N - 1]
+ skipDelMatrix[i], repLocal);
}
//calculate ALI
double skipDelMiddle = dp_tables.get(
Recurrence.SKIPDEL_MIDDLE)[i + minMiddleSkips][N - 1];
for (int i2 = 0; i2 < minMiddleSkips; i2++) {
skipDelMiddle += skipDelMatrix[i + i2];
}
dp_tables.get(Recurrence.ALI)[i][N - 1] = choice(
skipDelMiddle,
repLocal,
dp_tables.get(Recurrence.DEL)[i + 1][N - 1]
+ delMatrix[i]
);
}
}
/*
* Now we calculate this part of the ALI matrix:
*
* ---------x-
* -----------
* -----------
* -----------
* -----------
* -----------
*
* where it is possible to insert and skip insert, but not to use
* deletions. The relevant recurrences are.
*
* ALI = ins(, INS) | rep(, ALI) |
* SKIPINS_MIDDLE | SKIPDEL_END #h;
*
* INS = ins(, INS) | rep(, ALI) #h;
*
* SKIPINS_MIDDLE = skip_ins(,
* skip_ins(,
* skip_ins(,
* skip_ins(,
* SKIPINS_MIDDLE_LOOP)))) #h;
*
* SKIPINS_MIDDLE_LOOP = skip_ins(, SKIPINS_MIDDLE_LOOP) |
* rep(,ALI) #h;
*
* So we also need to calculate the SKIPINS_MIDDLE_LOOP recurrence,
* which we call SKIPINS_MIDDLE here, because we do not need to tabulate
* the SKIPINS_MIDDLE reccurence. This can be circumvented by
* converting the grammar to:
*
* ALI = ins(, INS) | rep(, ALI) |
* skip_ins(,
* skip_ins(,
* skip_ins(,
* skip_ins(,
* SKIPINS_MIDDLE_LOOP)))) | SKIPDEL_END #h;
*
* So the SKIPINS_MIDDLE recurrence is just for better readability in
* the gap-L-code and not needed per se.
*/
if (M > 0 && N > minMiddleSkips + 1) {
//initialize second-last row of SKIPINS_MIDDLE
for (int i = M - 1; i >= 0; i--) {
dp_tables.get(Recurrence.SKIPINS_MIDDLE)[i][N - 1]
= dp_tables.get(Recurrence.ALI)[i + 1][N]
+ compareMatrix[i][N - 1];
}
//initialize second last column up to the desired point.
for (int j = N - 2; j > rightPreMiddleSkipsLimit; j--) {
final double repLocal = dp_tables.get(Recurrence.ALI)[M][j + 1]
+ compareMatrix[M - 1][j];
dp_tables.get(Recurrence.SKIPINS_MIDDLE)[M - 1][j] = choice(
dp_tables.get(Recurrence.SKIPINS_MIDDLE)[M - 1][j + 1]
+ skipInsMatrix[j], repLocal);
}
for (int j = rightPreMiddleSkipsLimit; j >= 0; j--) {
//calculate SKIPINS_MIDDLE
final double repLocal = dp_tables.get(Recurrence.ALI)[M][j + 1]
+ compareMatrix[M - 1][j];
if (j >= minMiddleSkips) {
dp_tables.get(Recurrence.SKIPINS_MIDDLE)[M - 1][j] = choice(
dp_tables.get(Recurrence.SKIPINS_MIDDLE)[M - 1][j + 1]
+ skipInsMatrix[j], repLocal);
}
//calculate ALI
double skipInsMiddle = dp_tables.get(
Recurrence.SKIPINS_MIDDLE)[M - 1][j + minMiddleSkips];
for (int j2 = 0; j2 < minMiddleSkips; j2++) {
skipInsMiddle += skipInsMatrix[j + j2];
}
dp_tables.get(Recurrence.ALI)[M - 1][j] = choice(
skipInsMiddle,
repLocal,
dp_tables.get(Recurrence.INS)[M - 1][j + 1]
+ insMatrix[j]
);
}
}
/*
* Calculate SKIPINS_MIDDLE and SKIPDEL_MIDDLE for this section
* of the matrix as well:
*
* -----------
* ------xxx--
* ------xxx--
* ------xxx--
* -----------
* -----------
*
*/
if (N > 1) {
for (int i = M - 2; i > leftPreMiddleSkipsLimit; i--) {
for (int j = N - 2; j > rightPreMiddleSkipsLimit; j--) {
final double repLocal
= dp_tables.get(Recurrence.ALI)[i + 1][j + 1]
+ compareMatrix[i][j];
dp_tables.get(Recurrence.SKIPDEL_MIDDLE)[i][j] = choice(
dp_tables.get(Recurrence.SKIPDEL_MIDDLE)[i + 1][j]
+ skipDelMatrix[i], repLocal);
dp_tables.get(Recurrence.SKIPINS_MIDDLE)[i][j] = choice(
dp_tables.get(Recurrence.SKIPINS_MIDDLE)[i][j + 1]
+ skipInsMatrix[j], repLocal);
}
}
}
/*
* Now we calculate that part of the ALI matrix where we can use
* deletions, skip deletions and insertions, but no skip insertions.
*
* So the relevant part of the grammar ist:
*
* ALI = del(, DEL) | ins(, INS) |
* rep(, ALI) | SKIPDEL_MIDDLE | SKIPDEL_END #h;
*
* DEL = del(, DEL) | ins(, INS) |
* rep(, ALI) #h;
*
* INS = ins(, INS) | rep(, ALI) #h;
*
* SKIPDEL_MIDDLE = skip_del(,
* skip_del(,
* skip_del(,
* skip_del(,
* SKIPDEL_MIDDLE_LOOP)))) #h;
*
* SKIPDEL_MIDDLE_LOOP = skip_del(, SKIPDEL_MIDDLE_LOOP) |
* rep(,ALI) #h;
*
* Calculate this part of the matrix:
*
* -----------
* xxxxxx-----
* xxxxxx-----
* xxxxxx-----
* -----------
* -----------
*/
if (M > minMiddleSkips + 1) {
for (int j = N - 2; j > rightPreMiddleSkipsLimit; j--) {
for (int i = leftPreMiddleSkipsLimit; i >= 0; i--) {
//calculate choices
final double[] choices = new double[5];
choices[0] = dp_tables.get(Recurrence.DEL)[i + 1][j] + delMatrix[i];
choices[1] = dp_tables.get(Recurrence.INS)[i][j + 1] + insMatrix[j];
choices[2] = dp_tables.get(Recurrence.ALI)[i + 1][j + 1]
+ compareMatrix[i][j];
choices[3] = dp_tables.get(Recurrence.SKIPDEL_END)[i][j];
//calculate SKIPDEL_MIDDLE
double skipDelMiddle = dp_tables.get(
Recurrence.SKIPDEL_MIDDLE)[i + minMiddleSkips][j];
for (int i2 = 0; i2 < minMiddleSkips; i2++) {
skipDelMiddle += skipDelMatrix[i + i2];
}
choices[4] = skipDelMiddle;
dp_tables.get(Recurrence.ALI)[i][j] = choice(choices);
dp_tables.get(Recurrence.DEL)[i][j] = choice(
choices[0], choices[1], choices[2]);
dp_tables.get(Recurrence.INS)[i][j] = choice(
choices[1], choices[2]);
//also calculate the SKIPDEL_MIDDLE and SKIPINS_MIDDLE
//recurrence.
if (i >= minMiddleSkips) {
dp_tables.get(Recurrence.SKIPDEL_MIDDLE)[i][j] = choice(
choices[2],
dp_tables.get(Recurrence.SKIPDEL_MIDDLE)[i + 1][j]
+ skipDelMatrix[i]
);
}
if (j >= minMiddleSkips) {
dp_tables.get(Recurrence.SKIPINS_MIDDLE)[i][j] = choice(
choices[2],
dp_tables.get(Recurrence.SKIPINS_MIDDLE)[i][j + 1]
+ skipInsMatrix[j]
);
}
}
}
}
/*
* Now we calculate the part of the ALI matrix, where we can use
* insertions, skip insertions and deletions, but no skip deletions.
* So the relevant part of the grammar ist:
*
* ALI = del(, DEL) | ins(, INS) |
* rep(, ALI) |
* SKIPDEL_MIDDLE | SKIPINS_MIDDLE | SKIPDEL_END #h;
*
* DEL = del(, DEL) | ins(, INS) |
* rep(, ALI) #h;
*
* INS = ins(, INS) | rep(, ALI) #h;
*
* SKIPINS_MIDDLE = skip_ins(,
* skip_ins(,
* skip_ins(,
* skip_ins(,
* SKIPINS_MIDDLE_LOOP)))) #h;
*
* SKIPINS_MIDDLE_LOOP = skip_ins(, SKIPINS_MIDDLE_LOOP) |
* rep(,ALI) #h;
*
* Calculate this part of the matrix:
*
* ------xxx--
* -----------
* -----------
* -----------
* -----------
* -----------
*/
if (N > minMiddleSkips + 1) {
for (int i = M - 2; i > leftPreMiddleSkipsLimit; i--) {
for (int j = rightPreMiddleSkipsLimit; j >= 0; j--) {
//calculate choices
final double[] choices = new double[5];
choices[0] = dp_tables.get(Recurrence.DEL)[i + 1][j] + delMatrix[i];
choices[1] = dp_tables.get(Recurrence.INS)[i][j + 1] + insMatrix[j];
choices[2] = dp_tables.get(Recurrence.ALI)[i + 1][j + 1]
+ compareMatrix[i][j];
choices[3] = dp_tables.get(Recurrence.SKIPDEL_END)[i][j];
//calculate SKIPINS_MIDDLE
double skipInsMiddle = dp_tables.get(
Recurrence.SKIPINS_MIDDLE)[i][j + minMiddleSkips];
for (int j2 = 0; j2 < minMiddleSkips; j2++) {
skipInsMiddle += skipInsMatrix[j + j2];
}
choices[4] = skipInsMiddle;
dp_tables.get(Recurrence.ALI)[i][j] = choice(choices);
dp_tables.get(Recurrence.DEL)[i][j] = choice(
choices[0], choices[1], choices[2]);
dp_tables.get(Recurrence.INS)[i][j] = choice(
choices[1], choices[2]);
//also calculate the SKIPDEL_MIDDLE and SKIPINS_MIDDLE
//recurrence.
if (i >= minMiddleSkips) {
dp_tables.get(Recurrence.SKIPDEL_MIDDLE)[i][j] = choice(
choices[2],
dp_tables.get(Recurrence.SKIPDEL_MIDDLE)[i + 1][j]
+ skipDelMatrix[i]
);
}
if (j >= minMiddleSkips) {
dp_tables.get(Recurrence.SKIPINS_MIDDLE)[i][j] = choice(
choices[2],
dp_tables.get(Recurrence.SKIPINS_MIDDLE)[i][j + 1]
+ skipInsMatrix[j]
);
}
}
}
}
/*
* Finally we calculate the rest of the ALI matrix, where we can use
* the full reccurence:
*
* ALI = del(, DEL) | ins(, INS) |
* rep(, ALI) | SKIPDEL_MIDDLE | SKIPINS_MIDDLE |
* SKIPDEL_END #h;
*
* DEL = del(, DEL) | ins(, INS) |
* rep(, ALI) #h;
*
* INS = ins(, INS) | rep(, ALI) #h;
*
* SKIPDEL_MIDDLE = skip_del(,
* skip_del(,
* skip_del(,
* skip_del(,
* SKIPDEL_MIDDLE_LOOP)))) #h;
*
* SKIPDEL_MIDDLE_LOOP = skip_del(, SKIPDEL_MIDDLE_LOOP) |
* rep(,ALI) #h;
*
* SKIPINS_MIDDLE = skip_ins(,
* skip_ins(,
* skip_ins(,
* skip_ins(,
* SKIPINS_MIDDLE_LOOP)))) #h;
*
* SKIPINS_MIDDLE_LOOP = skip_ins(, SKIPINS_MIDDLE_LOOP) |
* rep(,ALI) #h;
*
*
* Calculate the rest of the matrix:
*
* xxxxxx-----
* -----------
* -----------
* -----------
* -----------
* -----------
*/
if (M > minMiddleSkips + 1 && N > minMiddleSkips + 1) {
for (int i = leftPreMiddleSkipsLimit; i >= 0; i--) {
for (int j = rightPreMiddleSkipsLimit; j >= 0; j--) {
//calculate SKIPDEL_MIDDLE
double skipDelMiddle = dp_tables.get(
Recurrence.SKIPDEL_MIDDLE)[i + minMiddleSkips][j];
for (int i2 = 0; i2 < minMiddleSkips; i2++) {
skipDelMiddle += skipDelMatrix[i + i2];
}
if (i >= minMiddleSkips) {
dp_tables.get(Recurrence.SKIPDEL_MIDDLE)[i][j] = choice(
dp_tables.get(Recurrence.ALI)[i + 1][j + 1]
+ compareMatrix[i][j],
dp_tables.get(Recurrence.SKIPDEL_MIDDLE)[i + 1][j]
+ skipDelMatrix[i]
);
}
//calculate SKIPINS_MIDDLE
double skipInsMiddle = dp_tables.get(
Recurrence.SKIPINS_MIDDLE)[i][j + minMiddleSkips];
for (int j2 = 0; j2 < minMiddleSkips; j2++) {
skipInsMiddle += skipInsMatrix[j + j2];
}
if (j >= minMiddleSkips) {
dp_tables.get(Recurrence.SKIPINS_MIDDLE)[i][j] = choice(
dp_tables.get(Recurrence.ALI)[i + 1][j + 1]
+ compareMatrix[i][j],
dp_tables.get(Recurrence.SKIPINS_MIDDLE)[i][j + 1]
+ skipInsMatrix[j]
);
}
//calculate other choices
final double[] choices = new double[6];
choices[0] = dp_tables.get(Recurrence.DEL)[i + 1][j]
+ delMatrix[i];
choices[1] = dp_tables.get(Recurrence.INS)[i][j + 1]
+ insMatrix[j];
choices[2] = dp_tables.get(Recurrence.ALI)[i + 1][j + 1]
+ compareMatrix[i][j];
choices[3] = dp_tables.get(Recurrence.SKIPDEL_END)[i][j];
choices[4] = skipDelMiddle;
choices[5] = skipInsMiddle;
//calculate ALI, DEL and INS.
dp_tables.get(Recurrence.ALI)[i][j] = choice(choices);
dp_tables.get(Recurrence.DEL)[i][j] = choice(
choices[0], choices[1], choices[2]);
dp_tables.get(Recurrence.INS)[i][j] = choice(
choices[1], choices[2]);
}
}
}
/*
* After that it gets easy again: We can now compute the SKIPINS_START
* recurrence.
*
* SKIPINS_START = skip_ins(, SKIPINS_START) |
* rep(, ALI) | nil() #h;
*/
//initialize last column
for (int j = N - 1; j >= 0; j--) {
dp_tables.get(Recurrence.SKIPINS_START)[M][j]
= dp_tables.get(Recurrence.SKIPINS_START)[M][j + 1] + skipInsMatrix[j];
}
//initialize second-last row.
if (N > 0) {
for (int i = M - 1; i >= 0; i--) {
dp_tables.get(Recurrence.SKIPINS_START)[i][N - 1]
= dp_tables.get(Recurrence.ALI)[i + 1][N] + compareMatrix[i][N - 1];
}
}
//calculate the rest of the matrix.
if (N > 1) {
for (int i = M - 1; i >= 0; i--) {
for (int j = N - 2; j >= 0; j--) {
dp_tables.get(Recurrence.SKIPINS_START)[i][j] = choice(
dp_tables.get(Recurrence.SKIPINS_START)[i][j + 1] + skipInsMatrix[j],
dp_tables.get(Recurrence.ALI)[i + 1][j + 1] + compareMatrix[i][j]);
}
}
}
/*
* And finally we can compute the SKIPDEL_START recurrence.
*
* SKIPDEL_START = skip_del(, SKIPDEL_START) | SKIPINS_START
* #h;
*/
//as this only goes to left we just need the first row here.
//initialize last entry.
dp_tables.get(Recurrence.SKIPDEL_START)[M][0]
= dp_tables.get(Recurrence.SKIPINS_START)[M][0];
//calculate the rest of the row
if (N > 0) {
for (int i = M - 1; i >= 0; i--) {
dp_tables.get(Recurrence.SKIPDEL_START)[i][0] = choice(
dp_tables.get(Recurrence.SKIPDEL_START)[i + 1][0]
+ skipDelMatrix[i],
dp_tables.get(Recurrence.SKIPINS_START)[i][0]
);
}
} else {
for (int i = M - 1; i >= 0; i--) {
dp_tables.get(Recurrence.SKIPDEL_START)[i][0]
= dp_tables.get(Recurrence.SKIPDEL_START)[i + 1][0]
+ skipDelMatrix[i];
}
}
return transformToResult(dp_tables,
compareMatrix,
delMatrix, insMatrix,
skipDelMatrix, skipInsMatrix,
a, b);
}
public static enum Recurrence {
SKIPDEL_START,
SKIPINS_START,
ALI,
DEL,
INS,
SKIPINS_MIDDLE,
SKIPDEL_MIDDLE,
SKIPINS_END,
SKIPDEL_END;
}
/**
* This should implement the choice function as the term is used
* in the Bellman's Gap context. In the strict case this is a minimum,
* in the soft case a softmin applied on the input.
*
* @param choices the costs of all choices.
*
* @return a cost chosen from the given input.
*/
public abstract double choice(double... choices);
/**
* This method should not be called from outside!
*
* The subclass uses this method to transform the alignment matrix and the
* input sequences to the actual alignment result.
*
* @param dp_tables the dynamic programming matrices used during
* calculation.
* @param compareMatrix the local cost for replacing node i from the first
* sequence with node j from the second sequence.
* @param deletionMatrix the local cost for deleting node i from the first
* sequence.
* @param insertionMatrix the local cost for inserting node j from the
* second sequence into the first sequence.
* @param skipDeletionMatrix the local cost for skipping node i in the
* first sequence.
* @param skipInsertionMatrix the local cost for skipping node j in the
* second sequence.
* @param a the first sequence.
* @param b th second sequence.
*
* @return the actual alignment result.
*/
public abstract R transformToResult(
final EnumMap dp_tables,
final double[][] compareMatrix,
final double[] deletionMatrix,
final double[] insertionMatrix,
final double[] skipDeletionMatrix,
final double[] skipInsertionMatrix,
final Sequence a,
final Sequence b);
//This is debugging code i leave here in case I have to debug dynamic programming
//matrices again.
// private static void debugMatrixPrint(EnumMap dp_matrices,
// Recurrence... recurrences) {
// for (final Recurrence rec : recurrences) {
// System.out.println("\n" + rec + "\n");
// final double[][] mat = dp_matrices.get(rec);
// final int M = mat.length - 1;
// final int N = mat[0].length - 1;
// //header.
// System.out.print(" ");
// for (int i = 0; i <= M; i++) {
// System.out.print("| ");
// final String idxStr = Integer.toString(i);
// System.out.print(idxStr);
// for (int blanks = 2 - idxStr.length(); blanks > 0; blanks--) {
// System.out.print(" ");
// }
// }
// System.out.println("");
// //lines.
// for (int j = 0; j <= N; j++) {
// //front.
// final String idxStr = Integer.toString(j);
// System.out.print(idxStr);
// for (int blanks = 2 - idxStr.length(); blanks > 0; blanks--) {
// System.out.print(" ");
// }
// //entries.
// for (int i = 0; i <= M; i++) {
// System.out.print("|");
// double rounded = (double) Math.round(mat[i][j] * 10) / 10;
// final String numStr = Double.toString(rounded);
// System.out.print(numStr);
// for (int blanks = 3 - numStr.length(); blanks > 0; blanks--) {
// System.out.print(" ");
// }
// }
// System.out.println("");
// }
// }
// }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy