net.maizegenetics.analysis.gbs.neobio.NeedlemanWunsch Maven / Gradle / Ivy
Show all versions of tassel Show documentation
/*
* NeedlemanWunsch.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:[email protected]
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
package net.maizegenetics.analysis.gbs.neobio;
import java.io.Reader;
import java.io.IOException;
/**
* This class implements the classic global alignment algorithm (with linear gap penalty
* function) due to S.B.Needleman and C.D.Wunsch (1970).
*
* It is based on a dynamic programming approach. The idea consists of, given two
* sequences A and B of sizes n and m, respectively, building an (n+1 x m+1) matrix M that
* contains the similarity of prefixes of A and B. Every position M[i,j] in the matrix
* holds the score between the subsequences A[1..i] and B[1..j]. The first row and column
* represent alignments with spaces.
*
* Starting from row 0, column 0, the algorithm computes each position M[i,j] with the
* following recurrence:
*
*
* M[0,0] = 0
* M[i,j] = max { M[i,j-1] + scoreInsertion (B[j]),
* M[i-1,j-1] + scoreSubstitution (A[i], B[j]),
* M[i-1,j] + scoreDeletion(A[i]) }
*
*
* In the end, the value at the last position (last row, last column) will contain
* the similarity between the two sequences. This part of the algorithm is accomplished
* by the {@link #computeMatrix computeMatrix} method. It has quadratic space complexity
* since it needs to keep an (n+1 x m+1) matrix in memory. And since the work of computing
* each cell is constant, it also has quadratic time complexity.
*
* After the matrix has been computed, the alignment can be retrieved by tracing a path
* back in the matrix from the last position to the first. This step is performed by
* the {@link #buildOptimalAlignment buildOptimalAlignment} method, and since the path can
* be roughly as long as (m + n), this method has O(n) time complexity.
*
* If the similarity value only is needed (and not the alignment itself), it is easy to
* reduce the space requirement to O(n) by keeping just the last row or column in memory.
* This is precisely what is done by the {@link #computeScore computeScore} method. Note
* that it still requires O(n2) time.
*
* For a more efficient approach to the global alignment problem, see the
* {@linkplain CrochemoreLandauZivUkelson} algorithm. For local alignment, see the
* {@linkplain SmithWaterman} algorithm.
*
* @author Sergio A. de Carvalho Jr.
* @see SmithWaterman
* @see CrochemoreLandauZivUkelson
* @see CrochemoreLandauZivUkelsonLocalAlignment
* @see CrochemoreLandauZivUkelsonGlobalAlignment
*/
public class NeedlemanWunsch extends PairwiseAlignmentAlgorithm
{
/**
* The first sequence of an alignment.
*/
protected CharSequence seq1;
/**
* The second sequence of an alignment.
*/
protected CharSequence seq2;
/**
* The dynamic programming matrix. Each position (i, j) represents the best score
* between the firsts i characters of seq1
and j characters of
* seq2
.
*/
protected int[][] matrix;
/**
* Loads sequences into {@linkplain CharSequence} instances. In case of any error,
* an exception is raised by the constructor of CharSequence
(please
* check the specification of that class for specific requirements).
*
* @param input1 Input for first sequence
* @param input2 Input for second sequence
* @throws IOException If an I/O error occurs when reading the sequences
* @throws InvalidSequenceException If the sequences are not valid
* @see CharSequence
*/
protected void loadSequencesInternal (Reader input1, Reader input2)
throws IOException, InvalidSequenceException
{
// load sequences into instances of CharSequence
this.seq1 = new CharSequence(input1);
this.seq2 = new CharSequence(input2);
}
/**
* Frees pointers to loaded sequences and the dynamic programming matrix so that their
* data can be garbage collected.
*/
protected void unloadSequencesInternal ()
{
this.seq1 = null;
this.seq2 = null;
this.matrix = null;
}
/**
* Builds an optimal global alignment between the loaded sequences after computing the
* dynamic programming matrix. It calls the buildOptimalAlignment
method
* after the computeMatrix
method computes the dynamic programming
* matrix.
*
* @return an optimal global alignment between the loaded sequences
* @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
* with the loaded sequences.
* @see #computeMatrix
* @see #buildOptimalAlignment
*/
protected PairwiseAlignment computePairwiseAlignment ()
throws IncompatibleScoringSchemeException
{
// compute the matrix
computeMatrix ();
// build and return an optimal global alignment
PairwiseAlignment alignment = buildOptimalAlignment ();
// allow the matrix to be garbage collected
matrix = null;
return alignment;
}
/**
* Computes the dynamic programming matrix.
*
* @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
* with the loaded sequences.
*/
protected void computeMatrix () throws IncompatibleScoringSchemeException
{
int r, c, rows, cols, ins, del, sub;
rows = seq1.length()+1;
cols = seq2.length()+1;
matrix = new int [rows][cols];
// initiate first row
matrix[0][0] = 0;
for (c = 1; c < cols; c++)
matrix[0][c] = matrix[0][c-1] + scoreInsertion(seq2.charAt(c));
// calculates the similarity matrix (row-wise)
for (r = 1; r < rows; r++)
{
// initiate first column
matrix[r][0] = matrix[r-1][0] + scoreDeletion(seq1.charAt(r));
for (c = 1; c < cols; c++)
{
ins = matrix[r][c-1] + scoreInsertion(seq2.charAt(c));
sub = matrix[r-1][c-1] + scoreSubstitution(seq1.charAt(r),seq2.charAt(c));
del = matrix[r-1][c] + scoreDeletion(seq1.charAt(r));
// choose the greatest
matrix[r][c] = max (ins, sub, del);
}
}
}
/**
* Builds an optimal global alignment between the loaded sequences. Before it is
* executed, the dynamic programming matrix must already have been computed by
* the computeMatrix
method.
*
* @return an optimal global alignment between the loaded sequences
* @throws IncompatibleScoringSchemeException If the scoring scheme
* is not compatible with the loaded sequences.
* @see #computeMatrix
*/
protected PairwiseAlignment buildOptimalAlignment ()
throws IncompatibleScoringSchemeException
{
StringBuffer gapped_seq1, score_tag_line, gapped_seq2;
int r, c, sub, max_score;
gapped_seq1 = new StringBuffer();
score_tag_line = new StringBuffer();
gapped_seq2 = new StringBuffer();
// start at the last row, last column
r = matrix.length - 1;
c = matrix[r].length - 1;
max_score = matrix[r][c];
while ((r > 0) || (c > 0))
{
if (c > 0)
if (matrix[r][c] == matrix[r][c-1] + scoreInsertion(seq2.charAt(c)))
{
// insertion was used
gapped_seq1.insert (0, GAP_CHARACTER);
score_tag_line.insert (0, GAP_TAG);
gapped_seq2.insert (0, seq2.charAt(c));
c = c - 1;
// skip to the next iteration
continue;
}
if ((r > 0) && (c > 0))
{
sub = scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
if (matrix[r][c] == matrix[r-1][c-1] + sub)
{
// substitution was used
gapped_seq1.insert (0, seq1.charAt(r));
if (seq1.charAt(r) == seq2.charAt(c))
if (useMatchTag())
score_tag_line.insert (0, MATCH_TAG);
else
score_tag_line.insert (0, seq1.charAt(r));
else if (sub > 0)
score_tag_line.insert (0, APPROXIMATE_MATCH_TAG);
else
score_tag_line.insert (0, MISMATCH_TAG);
gapped_seq2.insert (0, seq2.charAt(c));
r = r - 1; c = c - 1;
// skip to the next iteration
continue;
}
}
// must be a deletion
gapped_seq1.insert (0, seq1.charAt(r));
score_tag_line.insert (0, GAP_TAG);
gapped_seq2.insert (0, GAP_CHARACTER);
r = r - 1;
}
return new PairwiseAlignment (gapped_seq1.toString(), score_tag_line.toString(),
gapped_seq2.toString(), max_score);
}
/**
* Computes the score of the best global alignment between the two sequences using the
* scoring scheme previously set. This method calculates the similarity value only
* (doesn't build the whole matrix so the alignment cannot be recovered, however it
* has the advantage of requiring O(n) space only).
*
* @return score of the best global alignment between the loaded sequences
* @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
* with the loaded sequences.
*/
protected int computeScore () throws IncompatibleScoringSchemeException
{
int[] array;
int r, c, rows, cols, tmp, ins, del, sub;
rows = seq1.length()+1;
cols = seq2.length()+1;
if (rows <= cols)
{
// goes columnwise
array = new int [rows];
// initiate first column
array[0] = 0;
for (r = 1; r < rows; r++)
array[r] = array[r-1] + scoreDeletion(seq1.charAt(r));
// calculate the similarity matrix (keep current column only)
for (c = 1; c < cols; c++)
{
// initiate first row (tmp hold values
// that will be later moved to the array)
tmp = array[0] + scoreInsertion(seq2.charAt(c));
for (r = 1; r < rows; r++)
{
ins = array[r] + scoreInsertion(seq2.charAt(c));
sub = array[r-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
del = tmp + scoreDeletion(seq1.charAt(r));
// move the temp value to the array
array[r-1] = tmp;
// choose the greatest
tmp = max (ins, sub, del);
}
// move the temp value to the array
array[rows - 1] = tmp;
}
return array[rows - 1];
}
else
{
// goes rowwise
array = new int [cols];
// initiate first row
array[0] = 0;
for (c = 1; c < cols; c++)
array[c] = array[c-1] + scoreInsertion(seq2.charAt(c));
// calculate the similarity matrix (keep current row only)
for (r = 1; r < rows; r++)
{
// initiate first column (tmp hold values
// that will be later moved to the array)
tmp = array[0] + scoreDeletion(seq1.charAt(r));
for (c = 1; c < cols; c++)
{
ins = tmp + scoreInsertion(seq2.charAt(c));
sub = array[c-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
del = array[c] + scoreDeletion(seq1.charAt(r));
// move the temp value to the array
array[c-1] = tmp;
// choose the greatest
tmp = max (ins, sub, del);
}
// move the temp value to the array
array[cols - 1] = tmp;
}
return array[cols - 1];
}
}
}