All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.gbs.neobio.SmithWaterman Maven / Gradle / Ivy

/*
 * SmithWaterman.java
 *
 * Copyright 2003 Sergio Anibal de Carvalho Junior
 *
 * This file is part of NeoBio.
 *
 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
 * the GNU General Public License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with NeoBio;
 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * Proper attribution of the author as the source of the software would be appreciated.
 *
 * Sergio Anibal de Carvalho Junior		mailto:[email protected]
 * Department of Computer Science		http://www.dcs.kcl.ac.uk
 * King's College London, UK			http://www.kcl.ac.uk
 *
 * Please visit http://neobio.sourceforge.net
 *
 * This project was supervised by Professor Maxime Crochemore.
 *
 */

package net.maizegenetics.analysis.gbs.neobio;

import java.io.Reader;
import java.io.IOException;

/**
 * This class implement the classic local alignment algorithm (with linear gap penalty
 * function) due to T.F.Smith and M.S.Waterman (1981).
 *
 * 

This algorithm is very similar to the {@linkplain NeedlemanWunsch} algorithm for * global alignment. The idea here also consists of building an (n+1 x m+1) matrix M given * two sequences A and B of sizes n and m, respectively. However, unlike in the global * alignment case, every position M[i,j] in the matrix contains the similarity score of * suffixes of A[1..i] and B[1..j].

* *

Starting from row 0, column 0, the {@link #computeMatrix computeMatrix} method * computes each position M[i,j] with the following recurrence:

* *
 * M[0,0] = M[0,j] = M[i,0] = 0
 * M[i,j] = max { M[i,j-1]   + scoreInsertion (B[j]),
 *                M[i-1,j-1] + scoreSubstitution (A[i], B[j]),
 *                M[i-1,j]   + scoreDeletion(A[i])             }
 * 
* *

Note that, here, all cells in the first row and column are set to zero. The best * local alignment score is the highest value found anywhere in the matrix.

* *

Just like in global alignment case, this algorithm has quadratic space complexity * because it needs to keep an (n+1 x m+1) matrix in memory. And since the work of * computing each cell is constant, it also has quadratic time complexity.

* *

After the matrix has been computed, the alignment can be retrieved by tracing a path * back in the matrix from the position of the highest score until a cell of value zero is * reached. This step is performed by the {@link #buildOptimalAlignment * buildOptimalAlignment} method, and its time complexity is linear on the size of the * alignment. * *

If the similarity value only is needed (and not the alignment itself), it is easy to * reduce the space requirement to O(n) by keeping just the last row or column in memory. * This is precisely what is done by the {@link #computeScore computeScore} method. Note * that it still requires O(n2) time.

* *

For a more efficient approach to the local alignment problem, see the * {@linkplain CrochemoreLandauZivUkelson} algorithm. For global alignment, see the * {@linkplain NeedlemanWunsch} algorithm.

* * @author Sergio A. de Carvalho Jr. * @see NeedlemanWunsch * @see CrochemoreLandauZivUkelson * @see CrochemoreLandauZivUkelsonLocalAlignment * @see CrochemoreLandauZivUkelsonGlobalAlignment */ public class SmithWaterman extends PairwiseAlignmentAlgorithm { /** * The first sequence of an alignment. */ protected CharSequence seq1; /** * The second sequence of an alignment. */ protected CharSequence seq2; /** * The dynamic programming matrix. Each position (i, j) represents the best score * between a suffic of the firsts i characters of seq1 and a suffix of * the first j characters of seq2. */ protected int[][] matrix; /** * Indicate the row of where an optimal local alignment can be found in the matrix.. */ protected int max_row; /** * Indicate the column of where an optimal local alignment can be found in the matrix. */ protected int max_col; /** * Loads sequences into {@linkplain CharSequence} instances. In case of any error, an * exception is raised by the constructor of CharSequence (please check * the specification of that class for specific requirements). * * @param input1 Input for first sequence * @param input2 Input for second sequence * @throws IOException If an I/O error occurs when reading the sequences * @throws InvalidSequenceException If the sequences are not valid * @see CharSequence */ protected void loadSequencesInternal (Reader input1, Reader input2) throws IOException, InvalidSequenceException { // load sequences into instances of CharSequence this.seq1 = new CharSequence(input1); this.seq2 = new CharSequence(input2); } /** * Frees pointers to loaded sequences and the dynamic programming matrix so that their * data can be garbage collected. */ protected void unloadSequencesInternal () { this.seq1 = null; this.seq2 = null; this.matrix = null; } /** * Builds an optimal local alignment between the loaded sequences after computing the * dynamic programming matrix. It calls the buildOptimalAlignment method * after the computeMatrix method computes the dynamic programming * matrix. * * @return an optimal pairwise alignment between the loaded sequences * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible * with the loaded sequences. * @see #computeMatrix * @see #buildOptimalAlignment */ protected PairwiseAlignment computePairwiseAlignment () throws IncompatibleScoringSchemeException { // compute the matrix computeMatrix (); // build and return an optimal local alignment PairwiseAlignment alignment = buildOptimalAlignment (); // allow the matrix to be garbage collected matrix = null; return alignment; } /** * Computes the dynamic programming matrix. * * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible * with the loaded sequences. */ protected void computeMatrix () throws IncompatibleScoringSchemeException { int r, c, rows, cols, ins, sub, del, max_score; rows = seq1.length()+1; cols = seq2.length()+1; matrix = new int [rows][cols]; // initiate first row for (c = 0; c < cols; c++) matrix[0][c] = 0; // keep track of the maximum score this.max_row = this.max_col = max_score = 0; // calculates the similarity matrix (row-wise) for (r = 1; r < rows; r++) { // initiate first column matrix[r][0] = 0; for (c = 1; c < cols; c++) { ins = matrix[r][c-1] + scoreInsertion(seq2.charAt(c)); sub = matrix[r-1][c-1] + scoreSubstitution(seq1.charAt(r),seq2.charAt(c)); del = matrix[r-1][c] + scoreDeletion(seq1.charAt(r)); // choose the greatest matrix[r][c] = max (ins, sub, del, 0); if (matrix[r][c] > max_score) { // keep track of the maximum score max_score = matrix[r][c]; this.max_row = r; this.max_col = c; } } } } /** * Builds an optimal local alignment between the loaded sequences. Before it is * executed, the dynamic programming matrix must already have been computed by * the computeMatrix method. * * @return an optimal local alignment between the loaded sequences * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible * with the loaded sequences. * @see #computeMatrix */ protected PairwiseAlignment buildOptimalAlignment () throws IncompatibleScoringSchemeException { StringBuffer gapped_seq1, score_tag_line, gapped_seq2; int r, c, max_score, sub; // start at the cell with maximum score r = this.max_row; c = this.max_col; max_score = matrix[r][c]; gapped_seq1 = new StringBuffer(); score_tag_line = new StringBuffer(); gapped_seq2 = new StringBuffer(); while ((r > 0 || c > 0) && (matrix[r][c] > 0)) { if (c > 0) if (matrix[r][c] == matrix[r][c-1] + scoreInsertion(seq2.charAt(c))) { // insertion gapped_seq1.insert (0, GAP_CHARACTER); score_tag_line.insert (0, GAP_TAG); gapped_seq2.insert (0, seq2.charAt(c)); c = c - 1; // skip to the next iteration continue; } if ((r > 0) && (c > 0)) { sub = scoreSubstitution(seq1.charAt(r), seq2.charAt(c)); if (matrix[r][c] == matrix[r-1][c-1] + sub) { // substitution gapped_seq1.insert (0, seq1.charAt(r)); if (seq1.charAt(r) == seq2.charAt(c)) if (useMatchTag()) score_tag_line.insert (0, MATCH_TAG); else score_tag_line.insert (0, seq1.charAt(r)); else if (sub > 0) score_tag_line.insert (0, APPROXIMATE_MATCH_TAG); else score_tag_line.insert (0, MISMATCH_TAG); gapped_seq2.insert (0, seq2.charAt(c)); r = r - 1; c = c - 1; // skip to the next iteration continue; } } // must be a deletion gapped_seq1.insert (0, seq1.charAt(r)); score_tag_line.insert (0, GAP_TAG); gapped_seq2.insert (0,GAP_CHARACTER); r = r - 1; } return new PairwiseAlignment (gapped_seq1.toString(), score_tag_line.toString(), gapped_seq2.toString(), max_score,r,c); } /** * Computes the score of the best local alignment between the two sequences using the * scoring scheme previously set. This method calculates the similarity value only * (doesn't build the whole matrix so the alignment cannot be recovered, however it * has the advantage of requiring O(n) space only). * * @return the score of the best local alignment between the loaded sequences * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible * with the loaded sequences. */ protected int computeScore () throws IncompatibleScoringSchemeException { int[] array; int rows = seq1.length()+1, cols = seq2.length()+1; int r, c, tmp, ins, del, sub, max_score; // keep track of the maximum score max_score = 0; if (rows <= cols) { // goes columnwise array = new int [rows]; // initiate first column for (r = 0; r < rows; r++) array[r] = 0; // calculate the similarity matrix (keep current column only) for (c = 1; c < cols; c++) { // set first position to zero (tmp hold values // that will be later moved to the array) tmp = 0; for (r = 1; r < rows; r++) { ins = array[r] + scoreInsertion(seq2.charAt(c)); sub = array[r-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c)); del = tmp + scoreDeletion(seq1.charAt(r)); // move the temp value to the array array[r-1] = tmp; // choose the greatest (or zero if all negative) tmp = max (ins, sub, del, 0); // keep track of the maximum score if (tmp > max_score) max_score = tmp; } // move the temp value to the array array[rows - 1] = tmp; } } else { // goes rowwise array = new int [cols]; // initiate first row for (c = 0; c < cols; c++) array[c] = 0; // calculate the similarity matrix (keep current row only) for (r = 1; r < rows; r++) { // set first position to zero (tmp hold values // that will be later moved to the array) tmp = 0; for (c = 1; c < cols; c++) { ins = tmp + scoreInsertion(seq2.charAt(c)); sub = array[c-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c)); del = array[c] + scoreDeletion(seq1.charAt(r)); // move the temp value to the array array[c-1] = tmp; // choose the greatest (or zero if all negative) tmp = max (ins, sub, del, 0); // keep track of the maximum score if (tmp > max_score) max_score = tmp; } // move the temp value to the array array[cols - 1] = tmp; } } return max_score; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy