// net.maizegenetics.analysis.gbs.SmithWaterman Maven / Gradle / Ivy
package net.maizegenetics.analysis.gbs;
/**
* Created by IntelliJ IDEA.
* User: ed
* Date: May 31, 2008
* Time: 9:35:16 AM
* To change this template use File | Settings | File Templates.
*/
/*
* SmithWaterman.java
*
* Copyright 2003 Sergio Anibal de Carvalho Junior
*
* This file is part of NeoBio.
*
* NeoBio is free software; you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with NeoBio;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* Proper attribution of the author as the source of the software would be appreciated.
*
* Sergio Anibal de Carvalho Junior mailto:[email protected]
* Department of Computer Science http://www.dcs.kcl.ac.uk
* King's College London, UK http://www.kcl.ac.uk
*
* Please visit http://neobio.sourceforge.net
*
* This project was supervised by Professor Maxime Crochemore.
*
*/
import java.util.Arrays;
import org.biojava.nbio.alignment.NeedlemanWunsch;
/**
* This class implements the classic local alignment algorithm (with linear gap penalty
* function) due to T.F.Smith and M.S.Waterman (1981).
*
* This algorithm is very similar to the {@linkplain NeedlemanWunsch} algorithm for
* global alignment. The idea here also consists of building an (n+1 x m+1) matrix M given
* two sequences A and B of sizes n and m, respectively. However, unlike in the global
* alignment case, every position M[i,j] in the matrix contains the similarity score of
* suffixes of A[1..i] and B[1..j].
*
* Starting from row 0, column 0, the {@link #computeMatrix computeMatrix} method
* computes each position M[i,j] with the following recurrence:
*
*
* <pre>
* M[0,0] = M[0,j] = M[i,0] = 0
* M[i,j] = max { 0,
*                M[i,j-1]   + scoreInsertion (B[j]),
*                M[i-1,j-1] + scoreSubstitution (A[i], B[j]),
*                M[i-1,j]   + scoreDeletion(A[i]) }
* </pre>
*
*
* Note that, here, all cells in the first row and column are set to zero. The best
* local alignment score is the highest value found anywhere in the matrix.
*
* Just like in global alignment case, this algorithm has quadratic space complexity
* because it needs to keep an (n+1 x m+1) matrix in memory. And since the work of
* computing each cell is constant, it also has quadratic time complexity.
*
* After the matrix has been computed, the alignment can be retrieved by tracing a path
* back in the matrix from the position of the highest score until a cell of value zero is
* reached. This step is performed by the {@code buildOptimalAlignment} method, and its
* time complexity is linear in the size of the alignment.
*
*
* If only the similarity value is needed (and not the alignment itself), it is easy to
* reduce the space requirement to O(n) by keeping just the last row or column in memory.
* This is precisely what is done by the {@link #computeScore computeScore} method. Note
* that it still requires O(n<sup>2</sup>) time.
*
* For a more efficient approach to the local alignment problem, see the
* {@linkplain CrochemoreLandauZivUkelson} algorithm. For global alignment, see the
* {@linkplain NeedlemanWunsch} algorithm.
*
* @author Sergio A. de Carvalho Jr.
*/
public class SmithWaterman {

    /** Score added when the two aligned residues are identical. */
    private static final int MATCH_SCORE = 2;

    /** Score added when the two aligned residues differ. */
    private static final int MISMATCH_SCORE = 0;

    /** Score added for a single-residue insertion or deletion (linear gap model). */
    private static final int GAP_SCORE = -1;

    /**
     * The first sequence of an alignment. Only read by the no-argument constructor,
     * which converts it to {@link #bseq1}.
     */
    protected String seq1 = "CGGGTGTGACAGTCGTGCAGTCGACCGTTGGG";

    /**
     * The second sequence of an alignment. Only read by the no-argument constructor,
     * which converts it to {@link #bseq2}.
     */
    protected String seq2 = "XXXXXCGGGTGTGACAGTCGTGCAGTCGACCGTTGGGXXXXXXX";

    /**
     * The dynamic programming matrix. Each position (i, j) represents the best score
     * between a suffix of the first i characters of {@code seq1} and a suffix of the
     * first j characters of {@code seq2}. Allocated on demand by
     * {@link #computeMatrix()}.
     */
    protected int[][] matrix;

    /**
     * Row of the highest-scoring cell found by the last {@link #computeMatrix()} call,
     * i.e. where an optimal local alignment ends in the matrix.
     */
    protected int max_row;

    /**
     * Column of the highest-scoring cell found by the last {@link #computeMatrix()}
     * call, i.e. where an optimal local alignment ends in the matrix.
     */
    protected int max_col;

    /** Matrix dimensions: sequence length plus one for the all-zero boundary. */
    int rows, cols;

    /** The two sequences being aligned, one byte per residue. */
    byte[] bseq1, bseq2;

    /** Rolling buffer holding one row (or column) of the matrix for {@link #computeScore()}. */
    int[] array;

    /**
     * Creates an aligner loaded with the built-in {@link #seq1} and {@link #seq2}
     * sequences.
     *
     * Earlier revisions ran a 30,000-iteration timing loop here and discarded the
     * result; the loop has been removed so that construction is cheap. Call
     * {@link #computeScore()} to obtain the score.
     */
    public SmithWaterman() {
        rows = seq1.length() + 1;
        cols = seq2.length() + 1;
        // computeScore() iterates its inner loop over the shorter dimension.
        array = new int[Math.min(rows, cols)];
        bseq1 = seq1.getBytes();
        bseq2 = seq2.getBytes();
    }

    /**
     * Creates an aligner pre-sized for sequences of the given lengths. No sequences
     * are loaded; use {@link #computeScore(byte[], byte[])} to supply them.
     *
     * @param rows length of the first sequence
     * @param cols length of the second sequence
     */
    public SmithWaterman(int rows, int cols) {
        this.rows = rows + 1;
        this.cols = cols + 1;
        // Size from the incremented fields: sizing from the raw parameters left the
        // buffer one cell short whenever both sequences had the same length.
        array = new int[Math.max(this.rows, this.cols)];
    }

    /**
     * Creates an aligner loaded with the two given sequences.
     *
     * @param b1 first sequence, one byte per residue
     * @param b2 second sequence, one byte per residue
     */
    public SmithWaterman(byte[] b1, byte[] b2) {
        bseq1 = b1;
        bseq2 = b2;
        this.rows = b1.length + 1;
        this.cols = b2.length + 1;
        array = new int[Math.max(rows, cols)];
    }

    /**
     * Loads the two given sequences and returns their best local alignment score.
     * The sequences may be longer than those the instance was constructed for; the
     * working buffer is grown as needed.
     *
     * @param b1 first sequence, one byte per residue
     * @param b2 second sequence, one byte per residue
     * @return the score of the best local alignment between {@code b1} and {@code b2}
     */
    int computeScore(byte[] b1, byte[] b2) {
        bseq1 = b1;
        bseq2 = b2;
        this.rows = b1.length + 1;
        this.cols = b2.length + 1;
        return computeScore();
    }

    /**
     * Computes the full dynamic programming matrix for the loaded sequences and
     * records the position of the best cell in {@link #max_row} and
     * {@link #max_col}. The matrix is (re)allocated when it is missing or too small.
     * The score and the elapsed time are printed to standard output.
     *
     * @return the highest score found anywhere in the matrix
     */
    protected int computeMatrix() {
        long time = System.currentTimeMillis();

        // The matrix used to be left unallocated, so this method always failed with a
        // NullPointerException; allocate it lazily instead.
        if (matrix == null || matrix.length < rows
                || (rows > 0 && matrix[0].length < cols)) {
            matrix = new int[rows][cols];
        }

        // boundary: first row is all zero (needed when the matrix is being reused)
        for (int c = 0; c < cols; c++) {
            matrix[0][c] = 0;
        }

        int maxScore = 0;
        this.max_row = 0;
        this.max_col = 0;

        // fill the similarity matrix row by row
        for (int r = 1; r < rows; r++) {
            // boundary: first column is all zero
            matrix[r][0] = 0;
            for (int c = 1; c < cols; c++) {
                int ins = matrix[r][c - 1] + GAP_SCORE;
                int sub = matrix[r - 1][c - 1]
                        + (bseq1[r - 1] == bseq2[c - 1] ? MATCH_SCORE : MISMATCH_SCORE);
                int del = matrix[r - 1][c] + GAP_SCORE;
                // local alignment: a cell never drops below zero
                int cell = max(ins, sub, del, 0);
                matrix[r][c] = cell;
                // strict comparison keeps the first cell that reaches the maximum
                if (cell > maxScore) {
                    maxScore = cell;
                    this.max_row = r;
                    this.max_col = c;
                }
            }
        }
        System.out.println("MaxScore:" + maxScore);
        System.out.println("time:" + (System.currentTimeMillis() - time));
        return maxScore;
    }

    /**
     * Computes the score of the best local alignment between the two loaded
     * sequences. This method calculates the similarity value only: it keeps a single
     * row or column of the matrix (whichever is shorter) in memory, so the alignment
     * itself cannot be recovered.
     *
     * @return the score of the best local alignment between the loaded sequences
     */
    public int computeScore() {
        // The rolling buffer needs one cell per position of the shorter dimension.
        // Grow it if the loaded sequences outgrew what the constructor allocated.
        int needed = Math.min(rows, cols);
        if (array == null || array.length < needed) {
            array = new int[needed];
        }
        if (rows <= cols) {
            // first sequence is the shorter one: keep one column in memory
            return rollingScore(bseq1, rows, bseq2, cols);
        }
        // second sequence is the shorter one: keep one row in memory
        return rollingScore(bseq2, cols, bseq1, rows);
    }

    /**
     * Runs the local alignment recurrence keeping only one line of the matrix in
     * {@link #array}. Insertions and deletions carry the same score and the
     * substitution score is symmetric, so the two sequences may be passed in either
     * role.
     *
     * @param inner    sequence walked by the inner loop (indexes the rolling buffer)
     * @param innerDim matrix dimension belonging to {@code inner} (its length + 1)
     * @param outer    sequence walked by the outer loop
     * @param outerDim matrix dimension belonging to {@code outer} (its length + 1)
     * @return the highest cell value encountered
     */
    private int rollingScore(byte[] inner, int innerDim, byte[] outer, int outerDim) {
        int best = 0;
        // boundary line of the matrix is all zero
        Arrays.fill(array, 0, innerDim, 0);
        for (int o = 1; o < outerDim; o++) {
            // 'current' holds the newest cell of this line; it is written back to the
            // buffer one step late so that array[i - 1] still holds the diagonal
            // value from the previous line when it is read.
            int current = 0;
            for (int i = 1; i < innerDim; i++) {
                int fromPreviousLine = array[i] + GAP_SCORE;
                int diagonal = array[i - 1]
                        + (inner[i - 1] == outer[o - 1] ? MATCH_SCORE : MISMATCH_SCORE);
                int fromThisLine = current + GAP_SCORE;
                array[i - 1] = current;
                // local alignment: a cell never drops below zero
                current = max(fromPreviousLine, diagonal, fromThisLine, 0);
                if (current > best) {
                    best = current;
                }
            }
            array[innerDim - 1] = current;
        }
        return best;
    }

    /**
     * Returns the score of inserting a character.
     *
     * @param a the inserted character (ignored: the gap score is constant)
     * @return the insertion score
     */
    protected final int scoreInsertion(char a) {
        return GAP_SCORE;
    }

    /**
     * Returns the score of aligning one character against another.
     *
     * @param a character from the first sequence
     * @param b character from the second sequence
     * @return the match score when the characters are equal, the mismatch score otherwise
     */
    protected final int scoreSubstitution(char a, char b) {
        return (a == b) ? MATCH_SCORE : MISMATCH_SCORE;
    }

    /**
     * Returns the score of deleting a character.
     *
     * @param a the deleted character (ignored: the gap score is constant)
     * @return the deletion score
     */
    protected final int scoreDeletion(char a) {
        return GAP_SCORE;
    }

    /**
     * Helper method to compute the greater of two values.
     *
     * @param v1 first value
     * @param v2 second value
     * @return the larger of {@code v1} and {@code v2}
     */
    protected final int max(int v1, int v2) {
        return (v1 >= v2) ? v1 : v2;
    }

    /**
     * Helper method to compute the greatest of three values.
     *
     * @param v1 first value
     * @param v2 second value
     * @param v3 third value
     * @return the largest of {@code v1}, {@code v2} and {@code v3}
     */
    protected final int max(int v1, int v2, int v3) {
        return (v1 >= v2) ? ((v1 >= v3) ? v1 : v3) : ((v2 >= v3) ? v2 : v3);
    }

    /**
     * Helper method to compute the greatest of four values.
     *
     * @param v1 first value
     * @param v2 second value
     * @param v3 third value
     * @param v4 fourth value
     * @return the largest of {@code v1}, {@code v2}, {@code v3} and {@code v4}
     */
    protected final int max(int v1, int v2, int v3, int v4) {
        int m1 = (v1 >= v2) ? v1 : v2;
        int m2 = (v3 >= v4) ? v3 : v4;
        return (m1 >= m2) ? m1 : m2;
    }
}