de.citec.tcs.alignment.StrictKPathAlgorithm Maven / Gradle / Ivy

Go to download
/* 
 * TCS Alignment Toolbox
 * 
 * Copyright (C) 2013-2015
 * Benjamin Paaßen, Georg Zentgraf
 * AG Theoretical Computer Science
 * Centre of Excellence Cognitive Interaction Technology (CITEC)
 * University of Bielefeld
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see .
 */
package de.citec.tcs.alignment;

import de.citec.tcs.alignment.StrictAllOptimalAlgorithm.FullMatrixEntry;
import de.citec.tcs.alignment.comparators.OperationType;
import de.citec.tcs.alignment.sequence.Node;
import de.citec.tcs.alignment.sequence.Sequence;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Random;
import java.util.Stack;
import java.util.TreeMap;

/**
 * This is an implementation of the Needleman-Wunsch-Algorithm for sequence
 * alignment. It returns the k best paths. The scoring scheme is equivalent to
 * the StrictAlignmentScoreAlgorithm.
 *
 * @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
 */
public class StrictKPathAlgorithm
		extends AbstractGapAlignmentAlgorithm {

	public static final int DEFAULTPATHLIMIT = 100;
	private int pathLimit = DEFAULTPATHLIMIT;

	/**
	 * This sets up an AlignmentAlgorithm instance according to the given
	 * specification.
	 *
	 * @param alignmentSpecification an AlignmentSpecification.
	 */
	public StrictKPathAlgorithm(AlignmentSpecification alignmentSpecification) {
		super(alignmentSpecification, FullMatrixEntry.class, PathMap.class);
	}

	/**
	 * Please refer to the corresponding setter method for more calculation.
	 *
	 * @return a maximum number of paths that shall be calculated.
	 */
	public int getPathLimit() {
		return pathLimit;
	}

	/**
	 *
	 * This sets the k that gives the StrictKPathAlgorithm its name: The number
	 * of returned AlignmentPaths is limited to the k best alignments. The total
	 * number of possible AlignmentPaths is combinatorial in the input lengths.
	 * Thus we need to limit it to the k best. As soon as the limit is reached
	 * we decide randomly between equally good paths which one is taken.
	 *
	 * The default for this is DEFAULTPATHLIMIT.
	 *
	 * @param pathLimit a maximum number of paths that shall be calculated.
	 */
	public void setPathLimit(int pathLimit) {
		if (pathLimit < 1) {
			throw new IllegalArgumentException("You have to allow at least one path to be calculated!");
		}
		this.pathLimit = pathLimit;
	}

	/**
	 * {@inheritDoc }
	 */
	@Override
	public FullMatrixEntry createInitial() {
		return new FullMatrixEntry(0, 0, 0, 0, 0, 0);
	}

	/**
	 * {@inheritDoc }
	 */
	@Override
	public FullMatrixEntry createDelInitial(FullMatrixEntry delOld, int i, double delLocal) {
		return new FullMatrixEntry(delOld.getScore() + delLocal, delLocal, 0, 0, i, 0);
	}

	/**
	 * {@inheritDoc }
	 */
	@Override
	public FullMatrixEntry createInsInitial(FullMatrixEntry insOld, int j, double insLocal) {
		return new FullMatrixEntry(insOld.getScore() + insLocal, 0, insLocal, 0, 0, j);
	}

	/**
	 * {@inheritDoc }
	 */
	@Override
	public FullMatrixEntry createNewEntry(FullMatrixEntry delOld,
			FullMatrixEntry insOld,
			FullMatrixEntry repOld,
			int i, int j,
			double delLocal, double insLocal, double repLocal) {
		final double delTotal = delOld.getScore() + delLocal;
		final double insTotal = insOld.getScore() + insLocal;
		final double repTotal = repOld.getScore() + repLocal;
		if (delTotal < insTotal) {
			if (delTotal < repTotal) {
				return new FullMatrixEntry(delTotal, delLocal, insLocal, repLocal, i, j);
			} else {
				return new FullMatrixEntry(repTotal, delLocal, insLocal, repLocal, i, j);
			}
		} else {
			if (insTotal < repTotal) {
				return new FullMatrixEntry(insTotal, delLocal, insLocal, repLocal, i, j);
			} else {
				return new FullMatrixEntry(repTotal, delLocal, insLocal, repLocal, i, j);
			}
		}
	}

	/**
	 * {@inheritDoc }
	 */
	@Override
	public PathMap transformToResult(FullMatrixEntry[][] alignmentMatrix,
			Sequence a, Sequence b) {
		final int M = a.getNodes().size();
		final int N = b.getNodes().size();
		final double normalization = (double) Math.max(1, M + N);
		final AlignmentSpecification spec = getSpecification();

		final PathMap pathMap = new PathMap();

		final Random rand = new Random();

		final FullMatrixEntry last = alignmentMatrix[M][N];
		//Now backtrace to reconstruct the paths.
		final IndexedStochasticPriorityQueue queue
				= new IndexedStochasticPriorityQueue<>();
		queue.put(last.getScore(), new TmpPath(M, N, last.getScore()));
		TmpPath current;
		int pathCounter = 1;
		while (!queue.isEmpty()) {
			current = queue.pollFirst();
			//we are finished if we are at the start.
			if (current.i == 0 && current.j == 0) {
				//in that case we transform the temporary path to an actual one.
				final AlignmentPath path = new AlignmentPath(spec,
						a, b, current.getScore() / normalization);
				while (!current.opStack.empty()) {
					final TmpOp tmpOp = current.opStack.pop();
					final Operation op = new Operation(tmpOp.left, tmpOp.right,
							tmpOp.type, path);
					op.setComparatorDistances(tmpOp.local);
					path.getOperations().add(op);
				}
				pathMap.put(path);
				continue;
			}
			//otherwise we still have work to do.
			final Node left;
			if (current.i > 0) {
				left = a.getNodes().get(current.i - 1);
			} else {
				left = null;
			}
			final Node right;
			if (current.j > 0) {
				right = b.getNodes().get(current.j - 1);
			} else {
				right = null;
			}
			double[] local;
			//If we are in the first column we can only delete.
			if (current.j == 0) {
				local = getSpecification().calculateDeletionCosts(left);
				final TmpOp del = new TmpOp(left, null, local, spec,
						OperationType.DELETION);
				current.opStack.push(del);
				current.i--;
				queue.put(current.getScore(), current);
				continue;
			}
			//if we are in the first row we can only insert.
			if (current.i == 0) {
				local = getSpecification().calculateInsertionCosts(right);
				final TmpOp ins = new TmpOp(null, right, local, spec,
						OperationType.INSERTION);
				current.opStack.push(ins);
				current.j--;
				queue.put(current.getScore(), current);
				continue;
			}
			//if we are inside the matrix we consider all operations
			final FullMatrixEntry currentCell = alignmentMatrix[current.i][current.j];
			final FullMatrixEntry delOld = alignmentMatrix[current.i - 1][current.j];
			final FullMatrixEntry insOld = alignmentMatrix[current.i][current.j - 1];
			final FullMatrixEntry repOld = alignmentMatrix[current.i - 1][current.j - 1];

			final double scoreToEnd = current.actualScoreToEnd;

			//Determine the score a path may have at worst to be considered still.
			boolean useDeletion = false;
			boolean useInsertion = false;
			boolean useReplacement = false;
			//if we have enough space to explore all possibilites, do that
			if (pathCounter < pathLimit - 2) {
				useDeletion = true;
				useInsertion = true;
				useReplacement = true;
			} else {
				//Otherwise we determine, which path we want to continue.
				final double delScore = delOld.getScore()
						+ currentCell.getDelLocal()
						+ current.actualScoreToEnd;
				final double insScore = insOld.getScore()
						+ currentCell.getInsLocal()
						+ current.actualScoreToEnd;
				final double repScore = repOld.getScore()
						+ currentCell.getRepLocal()
						+ current.actualScoreToEnd;
				/*
				 * determine the rank of each operation by sorting them
				 * according
				 * to their score. We shuffle the list to ensure
				 * random choices of operations.
				 */
				final ArrayList ranking = new ArrayList<>();
				ranking.add(new SortEntry(OperationType.DELETION, delScore));
				ranking.add(new SortEntry(OperationType.INSERTION, insScore));
				ranking.add(new SortEntry(OperationType.REPLACEMENT, repScore));
				Collections.shuffle(ranking);
				Collections.sort(ranking);
				final int lastOp = pathLimit - pathCounter + 1;
				for (int o = 0; o < lastOp; o++) {
					switch (ranking.get(o).opType) {
						case DELETION:
							useDeletion = true;
							break;
						case INSERTION:
							useInsertion = true;
							break;
						case REPLACEMENT:
							useReplacement = true;
							break;
					}
				}
				double worstScore;
				if (!queue.isEmpty()) {
					worstScore = queue.lastKey();
				} else {
					//If the queue is empty, our shuffling before determined 
					// already which path gets continued.
					worstScore = -1;
				}
				/*
				 * We use all later operations only if they are better than our
				 * current worst score in the queue.
				 */
				for (int o = lastOp; o < 3; o++) {
					final SortEntry opEntry = ranking.get(o);
					if (opEntry.score < worstScore) {
						switch (opEntry.opType) {
							case DELETION:
								useDeletion = true;
								break;
							case INSERTION:
								useInsertion = true;
								break;
							case REPLACEMENT:
								useReplacement = true;
								break;
						}
						pathCounter--;
						queue.pollLast();
						//if we poll something we need to update the worst score.
						if (!queue.isEmpty()) {
							worstScore = queue.lastKey();
						} else {
							worstScore = -1;
						}
					} else if (opEntry.score == worstScore) {
						/*
						 * if the score is equal to the worst one, we roll a die
						 * to determine which path gets continued.
						 */
						final ArrayList worstPaths = queue.get(opEntry.score);
						final int kickIdx = rand.nextInt(worstPaths.size() + 1);
						if (kickIdx < worstPaths.size()) {
							pathCounter--;
							switch (opEntry.opType) {
								case DELETION:
									useDeletion = true;
									break;
								case INSERTION:
									useInsertion = true;
									break;
								case REPLACEMENT:
									useReplacement = true;
									break;
							}
							if (worstPaths.size() > 1) {
								worstPaths.remove(kickIdx);
							} else {
								queue.pollLastEntry();
								//if we poll something we need to update the worst score.
								if (!queue.isEmpty()) {
									worstScore = queue.lastKey();
								} else {
									worstScore = -1;
								}
							}
						}
					} else {
						//if we are larger than the worst score, we can stop
						//searching.
						break;
					}
				}

			}
			//continue the backtracing with a deletion
			if (useDeletion) {
				local = getSpecification().calculateDeletionCosts(left);
				final TmpOp del = new TmpOp(left, null, local, spec,
						OperationType.DELETION);
				current.continuePath(delOld.getScore(), del);
				final TmpPath delPath = current;
				queue.put(delPath.getScore(), delPath);
			}
			//continue the backtracing with an insertion 
			if (useInsertion) {
				local = getSpecification().calculateInsertionCosts(right);
				final TmpOp ins = new TmpOp(null, right, local, spec,
						OperationType.INSERTION);
				final TmpPath inspPath;
				if (!useDeletion) {
					current.continuePath(insOld.getScore(), ins);
					inspPath = current;
				} else {
					//if we already continued the current path, we have to create
					//a new TmpPath object.
					inspPath = new TmpPath(insOld.getScore(), scoreToEnd,
							ins, current, OperationType.DELETION);
					pathCounter++;
				}
				queue.put(inspPath.getScore(), inspPath);
			}
			//continue the backtracing with a replacement
			if (useReplacement) {
				local = getSpecification().calculateReplacementCosts(left, right);
				final TmpOp rep = new TmpOp(left, right, local, spec,
						OperationType.REPLACEMENT);
				final TmpPath repPath;
				if (!useDeletion && !useInsertion) {
					current.continuePath(repOld.getScore(), rep);
					repPath = current;
				} else {
					//if we already continued the current path, we have to create
					//a new TmpPath object.
					if (useDeletion) {
						repPath = new TmpPath(repOld.getScore(), scoreToEnd,
								rep, current, OperationType.DELETION);
					} else {
						repPath = new TmpPath(repOld.getScore(), scoreToEnd,
								rep, current, OperationType.INSERTION);
					}
					pathCounter++;
				}
				queue.put(repPath.getScore(), repPath);
			}
		}
		return pathMap;
	}

	private static class TmpPath {

		private Stack opStack = new Stack<>();
		private int i;
		private int j;
		private double optimalScoreToBegin;
		private double actualScoreToEnd;

		/**
		 * This is for initialization.
		 *
		 * @param M
		 * @param N
		 * @param optimalScore
		 */
		public TmpPath(int M, int N, double optimalScore) {
			this.i = M;
			this.j = N;
			this.optimalScoreToBegin = optimalScore;
		}

		/**
		 * This is for continuation.
		 *
		 * @param i
		 * @param j
		 * @param optimalScoreToBegin
		 * @param actualScoreToEnd
		 * @param local
		 * @param toEnd
		 * @param before
		 */
		public TmpPath(double optimalScoreToBegin,
				double actualScoreToEnd,
				TmpOp local,
				TmpPath toEnd, OperationType before) {
			int iOffset = 0;
			int jOffset = 0;
			switch (before) {
				case DELETION:
				case SKIPDELETION:
					iOffset++;
					break;
				case INSERTION:
				case SKIPINSERTION:
					jOffset++;
					break;
				case REPLACEMENT:
					iOffset++;
					jOffset++;
					break;
			}
			switch (local.type) {
				case DELETION:
				case SKIPDELETION:
					iOffset--;
					break;
				case INSERTION:
				case SKIPINSERTION:
					jOffset--;
					break;
				case REPLACEMENT:
					iOffset--;
					jOffset--;
					break;
			}
			this.i = toEnd.i + iOffset;
			this.j = toEnd.j + jOffset;
			this.optimalScoreToBegin = optimalScoreToBegin;
			this.actualScoreToEnd = actualScoreToEnd + local.getWeightedLocalCost();
			for (final TmpOp op : toEnd.opStack) {
				opStack.push(op);
			}
			opStack.pop();
			opStack.push(local);
		}

		public void continuePath(double newOptimalScoreToBegin, TmpOp newOp) {
			switch (newOp.type) {
				case DELETION:
				case SKIPDELETION:
					this.i--;
					break;
				case INSERTION:
				case SKIPINSERTION:
					this.j--;
					break;
				case REPLACEMENT:
					this.i--;
					this.j--;
					break;
			}
			this.optimalScoreToBegin = newOptimalScoreToBegin;
			this.opStack.add(newOp);
			this.actualScoreToEnd = actualScoreToEnd
					+ newOp.getWeightedLocalCost();
		}

		public double getScore() {
			return optimalScoreToBegin + actualScoreToEnd;
		}
	}

	private static class TmpOp {

		private final Node left;
		private final Node right;
		private final double[] local;
		private final AlignmentSpecification spec;
		private final OperationType type;

		public TmpOp(Node left, Node right, double[] local,
				AlignmentSpecification spec, OperationType type) {
			this.left = left;
			this.right = right;
			this.local = local;
			this.spec = spec;
			this.type = type;
		}

		public double getWeightedLocalCost() {
			final double[] weighting = spec.getWeighting();
			double out = 0;
			for (int k = 0; k < spec.size(); k++) {
				out += weighting[k] * local[k];
			}
			return out;
		}
	}

	private static class SortEntry implements Comparable {

		private final OperationType opType;
		private final double score;

		public SortEntry(OperationType opType, double score) {
			this.opType = opType;
			this.score = score;
		}

		@Override
		public int compareTo(SortEntry t) {
			return Double.compare(score, t.score);
		}

	}

	/**
	 * This is a helper class, which actually extends the TreeMap to implement a
	 * PriorityQueue that is able to poll the minimum as well as the maximum in
	 * constanst time.
	 *
	 * @author Benjamin Paassen - [email protected]
	 * @param  The Key Class
	 * @param  The Value Class
	 */
	public static class IndexedStochasticPriorityQueue, V>
			extends TreeMap> {

		private final Random rand = new Random();

		public IndexedStochasticPriorityQueue() {
		}

		public void put(final K key, final V value) {
			ArrayList list = super.get(key);
			if (list == null) {
				list = new ArrayList<>();
				super.put(key, list);
			}
			list.add(value);
		}

		/**
		 * Polls the value associated with the minimum key. If more than one
		 * element
		 * is associated with the minimum key, it returns a random element from
		 * that
		 * list.
		 *
		 * @return the value associated with the minimum key.
		 */
		public V pollFirst() {
			if (isEmpty()) {
				return null;
			}
			final ArrayList list = super.firstEntry().getValue();
			if (list.size() > 1) {
				final int pollIdx = rand.nextInt(list.size());
				return list.remove(pollIdx);
			} else {
				return super.pollFirstEntry().getValue().get(0);
			}
		}

		/**
		 * Polls the value associated with the maximum key. If more than one
		 * element
		 * is associated with the minimum key, it returns a random element from
		 * that
		 * list.
		 *
		 * @return the value associated with the maximum key.
		 */
		public V pollLast() {
			if (isEmpty()) {
				return null;
			}
			final ArrayList list = super.lastEntry().getValue();
			if (list.size() > 1) {
				final int pollIdx = rand.nextInt(list.size());
				return list.remove(pollIdx);
			} else {
				return super.pollLastEntry().getValue().get(0);
			}
		}
	}
}