net.maizegenetics.analysis.gbs.neobio.NeedlemanWunsch Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of tassel Show documentation
TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage disequilibrium.
There is a newer version: 5.2.94
/*
 * NeedlemanWunsch.java
 *
 * Copyright 2003 Sergio Anibal de Carvalho Junior
 *
 * This file is part of NeoBio.
 *
 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
 * the GNU General Public License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with NeoBio;
 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * Proper attribution of the author as the source of the software would be appreciated.
 *
 * Sergio Anibal de Carvalho Junior		mailto:[email protected]
 * Department of Computer Science		http://www.dcs.kcl.ac.uk
 * King's College London, UK			http://www.kcl.ac.uk
 *
 * Please visit http://neobio.sourceforge.net
 *
 * This project was supervised by Professor Maxime Crochemore.
 *
 */

package net.maizegenetics.analysis.gbs.neobio;

import java.io.Reader;
import java.io.IOException;

/**
 * This class implements the classic global alignment algorithm (with linear gap penalty
 * function) due to S.B.Needleman and C.D.Wunsch (1970).
 *
 * It is based on a dynamic programming approach. The idea consists of, given two
 * sequences A and B of sizes n and m, respectively, building an (n+1 x m+1) matrix M that
 * contains the similarity of prefixes of A and B. Every position M[i,j] in the matrix
 * holds the score between the subsequences A[1..i] and B[1..j]. The first row and column
 * represent alignments with spaces.
 *
 * Starting from row 0, column 0, the algorithm computes each position M[i,j] with the
 * following recurrence:
 *
 *  * M[0,0] = 0
 * M[i,j] = max { M[i,j-1]   + scoreInsertion (B[j]),
 *                M[i-1,j-1] + scoreSubstitution (A[i], B[j]),
 *                M[i-1,j]   + scoreDeletion(A[i])             }
 * 
 *
 * In the end, the value at the last position (last row, last column) will contain
 * the similarity between the two sequences. This part of the algorithm is accomplished
 * by the {@link #computeMatrix computeMatrix} method. It has quadratic space complexity
 * since it needs to keep an (n+1 x m+1) matrix in memory. And since the work of computing
 * each cell is constant, it also has quadratic time complexity.
 *
 * After the matrix has been computed, the alignment can be retrieved by tracing a path
 * back in the matrix from the last position to the first. This step is performed by
 * the {@link #buildOptimalAlignment buildOptimalAlignment} method, and since the path can
 * be roughly as long as (m + n), this method has O(n) time complexity.
 *
 * If the similarity value only is needed (and not the alignment itself), it is easy to
 * reduce the space requirement to O(n) by keeping just the last row or column in memory.
 * This is precisely what is done by the {@link #computeScore computeScore} method. Note
 * that it still requires O(n²) time.
 *
 * For a more efficient approach to the global alignment problem, see the
 * {@linkplain CrochemoreLandauZivUkelson} algorithm. For local alignment, see the
 * {@linkplain SmithWaterman} algorithm.
 *
 * @author Sergio A. de Carvalho Jr.
 * @see SmithWaterman
 * @see CrochemoreLandauZivUkelson
 * @see CrochemoreLandauZivUkelsonLocalAlignment
 * @see CrochemoreLandauZivUkelsonGlobalAlignment
 */
public class NeedlemanWunsch extends PairwiseAlignmentAlgorithm
{
	/**
	 * The first sequence of an alignment.
	 */
	protected CharSequence seq1;

	/**
	 * The second sequence of an alignment.
	 */
	protected CharSequence seq2;

	/**
	 * The dynamic programming matrix. Each position (i, j) represents the best score
	 * between the firsts i characters of seq1 and j characters of
	 * seq2.
	 */
	protected int[][] matrix;

	/**
	 * Loads sequences into {@linkplain CharSequence} instances. In case of any error,
	 * an exception is raised by the constructor of CharSequence (please
	 * check the specification of that class for specific requirements).
	 *
	 * @param input1 Input for first sequence
	 * @param input2 Input for second sequence
	 * @throws IOException If an I/O error occurs when reading the sequences
	 * @throws InvalidSequenceException If the sequences are not valid
	 * @see CharSequence
	 */
	protected void loadSequencesInternal (Reader input1, Reader input2)
		throws IOException, InvalidSequenceException
	{
		// load sequences into instances of CharSequence
		this.seq1 = new CharSequence(input1);
		this.seq2 = new CharSequence(input2);
	}

	/**
	 * Frees pointers to loaded sequences and the dynamic programming matrix so that their
	 * data can be garbage collected.
	 */
	protected void unloadSequencesInternal ()
	{
		this.seq1 = null;
		this.seq2 = null;
		this.matrix = null;
	}

	/**
	 * Builds an optimal global alignment between the loaded sequences after computing the
	 * dynamic programming matrix. It calls the buildOptimalAlignment method
	 * after the computeMatrix method computes the dynamic programming
	 * matrix.
	 *
	 * @return an optimal global alignment between the loaded sequences
	 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
	 * with the loaded sequences.
	 * @see #computeMatrix
	 * @see #buildOptimalAlignment
	 */
	protected PairwiseAlignment computePairwiseAlignment ()
		throws IncompatibleScoringSchemeException
	{
		// compute the matrix
		computeMatrix ();

		// build and return an optimal global alignment
		PairwiseAlignment alignment = buildOptimalAlignment ();

		// allow the matrix to be garbage collected
		matrix = null;

		return alignment;
	}

	/**
	 * Computes the dynamic programming matrix.
	 *
	 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
	 * with the loaded sequences.
	 */
	protected void computeMatrix () throws IncompatibleScoringSchemeException
	{
		int r, c, rows, cols, ins, del, sub;

		rows = seq1.length()+1;
		cols = seq2.length()+1;

		matrix = new int [rows][cols];

		// initiate first row
		matrix[0][0] = 0;
		for (c = 1; c < cols; c++)
			matrix[0][c] = matrix[0][c-1] + scoreInsertion(seq2.charAt(c));

		// calculates the similarity matrix (row-wise)
		for (r = 1; r < rows; r++)
		{
			// initiate first column
			matrix[r][0] = matrix[r-1][0] + scoreDeletion(seq1.charAt(r));

			for (c = 1; c < cols; c++)
			{
				ins = matrix[r][c-1] + scoreInsertion(seq2.charAt(c));
				sub = matrix[r-1][c-1] + scoreSubstitution(seq1.charAt(r),seq2.charAt(c));
				del = matrix[r-1][c] + scoreDeletion(seq1.charAt(r));

				// choose the greatest
				matrix[r][c] = max (ins, sub, del);
			}
		}
	}

	/**
	 * Builds an optimal global alignment between the loaded sequences. Before it is
	 * executed, the dynamic programming matrix must already have been computed by
	 * the computeMatrix method.
	 *
	 * @return an optimal global alignment between the loaded sequences
	 * @throws IncompatibleScoringSchemeException If the scoring scheme
	 * is not compatible with the loaded sequences.
	 * @see #computeMatrix
	 */
	protected PairwiseAlignment buildOptimalAlignment ()
		throws IncompatibleScoringSchemeException
	{
		StringBuffer	gapped_seq1, score_tag_line, gapped_seq2;
		int				r, c, sub, max_score;

		gapped_seq1 	= new StringBuffer();
		score_tag_line	= new StringBuffer();
		gapped_seq2 	= new StringBuffer();

		// start at the last row, last column
		r = matrix.length - 1;
		c = matrix[r].length - 1;
		max_score = matrix[r][c];

		while ((r > 0) || (c > 0))
		{
			if (c > 0)
				if (matrix[r][c] == matrix[r][c-1] + scoreInsertion(seq2.charAt(c)))
				{
					// insertion was used
					gapped_seq1.insert (0, GAP_CHARACTER);
					score_tag_line.insert (0, GAP_TAG);
					gapped_seq2.insert (0, seq2.charAt(c));
					c = c - 1;

					// skip to the next iteration
					continue;
				}

			if ((r > 0) && (c > 0))
			{
				sub = scoreSubstitution(seq1.charAt(r), seq2.charAt(c));

				if (matrix[r][c] == matrix[r-1][c-1] + sub)
				{
					// substitution was used
					gapped_seq1.insert (0, seq1.charAt(r));
					if (seq1.charAt(r) == seq2.charAt(c))
						if (useMatchTag())
							score_tag_line.insert (0, MATCH_TAG);
						else
							score_tag_line.insert (0, seq1.charAt(r));
					else if (sub > 0)
						score_tag_line.insert (0, APPROXIMATE_MATCH_TAG);
					else
						score_tag_line.insert (0, MISMATCH_TAG);
					gapped_seq2.insert (0, seq2.charAt(c));
					r = r - 1; c = c - 1;

					// skip to the next iteration
					continue;
				}
			}

			// must be a deletion
			gapped_seq1.insert (0, seq1.charAt(r));
			score_tag_line.insert (0, GAP_TAG);
			gapped_seq2.insert (0, GAP_CHARACTER);
			r = r - 1;
		}

		return new PairwiseAlignment (gapped_seq1.toString(), score_tag_line.toString(),
										gapped_seq2.toString(), max_score);
	}

	/**
	 * Computes the score of the best global alignment between the two sequences using the
	 * scoring scheme previously set. This method calculates the similarity value only
	 * (doesn't build the whole matrix so the alignment cannot be recovered, however it
	 * has the advantage of requiring O(n) space only).
	 *
	 * @return score of the best global alignment between the loaded sequences
	 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
	 * with the loaded sequences.
	 */
	protected int computeScore () throws IncompatibleScoringSchemeException
	{
		int[]	array;
		int		r, c, rows, cols, tmp, ins, del, sub;

		rows = seq1.length()+1;
		cols = seq2.length()+1;

		if (rows <= cols)
		{
			// goes columnwise
			array = new int [rows];

			// initiate first column
			array[0] = 0;
			for (r = 1; r < rows; r++)
				array[r] = array[r-1] + scoreDeletion(seq1.charAt(r));

			// calculate the similarity matrix (keep current column only)
			for (c = 1; c < cols; c++)
			{
				// initiate first row (tmp hold values
				// that will be later moved to the array)
				tmp = array[0] + scoreInsertion(seq2.charAt(c));

				for (r = 1; r < rows; r++)
				{
					ins = array[r] + scoreInsertion(seq2.charAt(c));
					sub = array[r-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
					del = tmp + scoreDeletion(seq1.charAt(r));

					// move the temp value to the array
					array[r-1] = tmp;

					// choose the greatest
					tmp = max (ins, sub, del);
				}

				// move the temp value to the array
				array[rows - 1] = tmp;
			}

			return array[rows - 1];
		}
		else
		{
			// goes rowwise
			array = new int [cols];

			// initiate first row
			array[0] = 0;
			for (c = 1; c < cols; c++)
				array[c] = array[c-1] + scoreInsertion(seq2.charAt(c));

			// calculate the similarity matrix (keep current row only)
			for (r = 1; r < rows; r++)
			{
				// initiate first column (tmp hold values
				// that will be later moved to the array)
				tmp = array[0] + scoreDeletion(seq1.charAt(r));

				for (c = 1; c < cols; c++)
				{
					ins = tmp + scoreInsertion(seq2.charAt(c));
					sub = array[c-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
					del = array[c] + scoreDeletion(seq1.charAt(r));

					// move the temp value to the array
					array[c-1] = tmp;

					// choose the greatest
					tmp = max (ins, sub, del);
				}

				// move the temp value to the array
				array[cols - 1] = tmp;
			}

			return array[cols - 1];
		}
	}
}